From aa5e36456a5720fc341f59ca3fbd913c10f501e7 Mon Sep 17 00:00:00 2001 From: Chi Lo <54722500+chilo-ms@users.noreply.github.com> Date: Tue, 12 Sep 2023 07:37:45 -0700 Subject: [PATCH 01/34] [TRT EP] Fix multithreading bug of getting the corrupted trt engine instance (#17507) Revert to the old TRT EP behavior of securing the whole compute_function by lock_guard. Current TRT EP which only puts lock_guard around a critical section (obvious wrong) inside compute_function. The issue can happen where one thread is updating the engine in compute_function whereas another thread still accesses the stale/corrupted engine instance in compute_function, for example, the code outside the critical section, `int total_bindings = trt_engine->getNbBindings()`. So, make the whole compute_function the critical section should be okay. --- .../tensorrt/tensorrt_execution_provider.cc | 402 +++++++++--------- 1 file changed, 200 insertions(+), 202 deletions(-) diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index e90417a6d14fc..96893f63b4540 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -2433,6 +2433,11 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector(state); + + // The whole compute_function should be considered the critical section where multiple threads may update kernel function state, access one builder, create/serialize/save engine, + // save profile and serialize/save timing cache. Therefore, those operations should be synchronized across different threads when ORT is using multithreading. + // More details here, https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading + std::lock_guard lock(*(trt_state->tensorrt_mu_ptr)); const std::unordered_map& input_indexes = (trt_state->input_info)[0]; const std::unordered_map& output_indexes = (trt_state->output_info)[0]; const std::unordered_map& output_types = (trt_state->output_info)[1]; @@ -2475,237 +2480,230 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector lock(*(trt_state->tensorrt_mu_ptr)); - - // Load serialized engine - if (trt_state->engine_cache_enable && trt_engine == nullptr) { - std::ifstream engine_file(engine_cache_path, std::ios::binary | std::ios::in); - std::ifstream profile_file(profile_cache_path, std::ios::binary | std::ios::in); - if (engine_file && !trt_state->engine_decryption_enable && profile_file) { - // Deserialize profile - shape_ranges = DeserializeProfileV2(profile_file); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + profile_cache_path; - - // Prepare buffer - engine_file.seekg(0, std::ios::end); - size_t engine_size = engine_file.tellg(); - engine_file.seekg(0, std::ios::beg); - std::unique_ptr engine_buf{new char[engine_size]}; - engine_file.read((char*)engine_buf.get(), engine_size); - - // Deserialize engine - // Note: Deserializing an engine from a TensorRT runtime is thread safe per TRT doc - // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading - trt_state->engine->reset(); - *(trt_state->engine) = std::unique_ptr( - trt_state->runtime->deserializeCudaEngine(engine_buf.get(), engine_size, nullptr)); - if (*(trt_state->engine) == nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine."); - } - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path; - 
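A minimal sketch of the locking pattern this commit describes — holding a std::lock_guard for the full duration of the compute function rather than only around the engine-update block. All names below are hypothetical and only mirror the shape of trt_state->tensorrt_mu_ptr and compute_function; it is not the actual TRT EP code, and it assumes one mutex shared by every thread that executes the same fused node:

#include <mutex>

struct KernelState {
  std::mutex mu;            // shared by all threads that execute this fused node
  int engine_version = 0;   // stands in for the TensorRT engine object
};

// Locking the whole compute function, as in the patch above, prevents one thread
// from reading a stale or partially rebuilt engine while another thread replaces it.
int Compute(KernelState& state, bool needs_rebuild) {
  std::lock_guard<std::mutex> lock(state.mu);  // held for the entire call
  if (needs_rebuild) {
    ++state.engine_version;                    // "rebuild" the engine under the lock
  }
  return state.engine_version;                 // every caller sees a consistent state
}
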
trt_engine = trt_state->engine->get(); - context_update = true; - } else if (trt_state->engine_decryption_enable && std::filesystem::exists(encrypted_engine_cache_path) && profile_file) { - shape_ranges = DeserializeProfileV2(profile_file); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + profile_cache_path; - // Decrypt engine - size_t engine_size = 0; - if (!trt_state->engine_decryption(encrypted_engine_cache_path.c_str(), nullptr, &engine_size)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP could not get engine buffer size"); - } - std::unique_ptr engine_buf{new char[engine_size]}; - if (!trt_state->engine_decryption(encrypted_engine_cache_path.c_str(), &engine_buf[0], &engine_size)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP could not call engine decryption function decrypt"); - } - // Deserialize engine - // Note: Deserializing an engine from a TensorRT runtime is thread safe per TRT doc - // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading - trt_state->engine->reset(); - *(trt_state->engine) = std::unique_ptr(trt_state->runtime->deserializeCudaEngine(engine_buf.get(), engine_size, nullptr)); - if (*(trt_state->engine) == nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP could not deserialize engine from encrypted cache: " + encrypted_engine_cache_path); - } - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Decrypted and DeSerialized " + encrypted_engine_cache_path; - trt_engine = trt_state->engine->get(); - context_update = true; + // Load serialized engine + if (trt_state->engine_cache_enable && trt_engine == nullptr) { + std::ifstream engine_file(engine_cache_path, std::ios::binary | std::ios::in); + std::ifstream profile_file(profile_cache_path, std::ios::binary | std::ios::in); + if (engine_file && !trt_state->engine_decryption_enable && profile_file) { + // Deserialize profile + shape_ranges = DeserializeProfileV2(profile_file); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + profile_cache_path; + + // Prepare buffer + engine_file.seekg(0, std::ios::end); + size_t engine_size = engine_file.tellg(); + engine_file.seekg(0, std::ios::beg); + std::unique_ptr engine_buf{new char[engine_size]}; + engine_file.read((char*)engine_buf.get(), engine_size); + + // Deserialize engine + // Note: Deserializing an engine from a TensorRT runtime is thread safe per TRT doc + // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading + trt_state->engine->reset(); + *(trt_state->engine) = std::unique_ptr( + trt_state->runtime->deserializeCudaEngine(engine_buf.get(), engine_size, nullptr)); + if (*(trt_state->engine) == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine."); + } + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path; + trt_engine = trt_state->engine->get(); + context_update = true; + } else if (trt_state->engine_decryption_enable && std::filesystem::exists(encrypted_engine_cache_path) && profile_file) { + shape_ranges = DeserializeProfileV2(profile_file); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + profile_cache_path; + // Decrypt engine + size_t engine_size = 0; + if (!trt_state->engine_decryption(encrypted_engine_cache_path.c_str(), nullptr, &engine_size)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not get engine buffer size"); + } + std::unique_ptr engine_buf{new char[engine_size]}; + if 
(!trt_state->engine_decryption(encrypted_engine_cache_path.c_str(), &engine_buf[0], &engine_size)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not call engine decryption function decrypt"); + } + // Deserialize engine + // Note: Deserializing an engine from a TensorRT runtime is thread safe per TRT doc + // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading + trt_state->engine->reset(); + *(trt_state->engine) = std::unique_ptr(trt_state->runtime->deserializeCudaEngine(engine_buf.get(), engine_size, nullptr)); + if (*(trt_state->engine) == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not deserialize engine from encrypted cache: " + encrypted_engine_cache_path); } + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Decrypted and DeSerialized " + encrypted_engine_cache_path; + trt_engine = trt_state->engine->get(); + context_update = true; } + } - // Check and update shape ranges for dynamic shape inputs. - for (int i = 0, end = num_inputs; i < end; ++i) { - auto input = trt_state->network->get()->getInput(i); - const std::string& input_name = input->getName(); - input_names.insert(input_name); + // Check and update shape ranges for dynamic shape inputs. + for (int i = 0, end = num_inputs; i < end; ++i) { + auto input = trt_state->network->get()->getInput(i); + const std::string& input_name = input->getName(); + input_names.insert(input_name); - // If there is any input tensor in shape_ranges, it means this input tensor has dynamic shape and its profile shape values have not yet resolved. - // TRT EP will help determine the min/max/opt profile values based on current input tensor value. - if (shape_ranges.find(input_name) != shape_ranges.end()) { - auto status = ApplyProfileShapesFromInputTensorValue(trt_profiles, ctx, input, shape_ranges, input_indexes, tensor_shape_values, stream, &engine_update); - if (status != Status::OK()) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to parse input tensor and generate optimization profiles."); - } + // If there is any input tensor in shape_ranges, it means this input tensor has dynamic shape and its profile shape values have not yet resolved. + // TRT EP will help determine the min/max/opt profile values based on current input tensor value. + if (shape_ranges.find(input_name) != shape_ranges.end()) { + auto status = ApplyProfileShapesFromInputTensorValue(trt_profiles, ctx, input, shape_ranges, input_indexes, tensor_shape_values, stream, &engine_update); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to parse input tensor and generate optimization profiles."); } } + } - // Regenerate engine - if (engine_update) { - // Destroy the IExecutionContext objects before destroying an engine object, otherwise it will lead to undefined behavior. - if (GetPerThreadContext().IsTensorRTContextInMap(fused_node_name)) { - GetPerThreadContext().ResetTensorRTContext(fused_node_name); - } + // Regenerate engine + if (engine_update) { + // Destroy the IExecutionContext objects before destroying an engine object, otherwise it will lead to undefined behavior. 
+ if (GetPerThreadContext().IsTensorRTContextInMap(fused_node_name)) { + GetPerThreadContext().ResetTensorRTContext(fused_node_name); + } - trt_state->engine->reset(); - auto trt_config = std::unique_ptr(trt_builder->createBuilderConfig()); - trt_config->setMaxWorkspaceSize(*(trt_state->max_workspace_size_ptr)); - for (auto trt_profile : trt_profiles) { - trt_config->addOptimizationProfile(trt_profile); - } + trt_state->engine->reset(); + auto trt_config = std::unique_ptr(trt_builder->createBuilderConfig()); + trt_config->setMaxWorkspaceSize(*(trt_state->max_workspace_size_ptr)); + for (auto trt_profile : trt_profiles) { + trt_config->addOptimizationProfile(trt_profile); + } - // Set INT8 Per Tensor Dynamic range - if (trt_state->int8_enable && trt_builder->platformHasFastInt8() && trt_state->int8_calibration_cache_available) { - trt_config->setInt8Calibrator(nullptr); - if (!SetDynamicRange(*trt_state->network->get(), trt_state->dynamic_range_map)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to set INT8 dynamic range."); - } + // Set INT8 Per Tensor Dynamic range + if (trt_state->int8_enable && trt_builder->platformHasFastInt8() && trt_state->int8_calibration_cache_available) { + trt_config->setInt8Calibrator(nullptr); + if (!SetDynamicRange(*trt_state->network->get(), trt_state->dynamic_range_map)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to set INT8 dynamic range."); } + } - // Set precision - if (trt_state->fp16_enable && trt_state->int8_enable) { - trt_config->setFlags(1U << static_cast(nvinfer1::BuilderFlag::kFP16) | 1U << static_cast(nvinfer1::BuilderFlag::kINT8)); - } else if (trt_state->fp16_enable) { - trt_config->setFlag(nvinfer1::BuilderFlag::kFP16); - } else if (trt_state->int8_enable) { - trt_config->setFlag(nvinfer1::BuilderFlag::kINT8); - } + // Set precision + if (trt_state->fp16_enable && trt_state->int8_enable) { + trt_config->setFlags(1U << static_cast(nvinfer1::BuilderFlag::kFP16) | 1U << static_cast(nvinfer1::BuilderFlag::kINT8)); + } else if (trt_state->fp16_enable) { + trt_config->setFlag(nvinfer1::BuilderFlag::kFP16); + } else if (trt_state->int8_enable) { + trt_config->setFlag(nvinfer1::BuilderFlag::kINT8); + } - // Set DLA (DLA can only run with FP16 or INT8) - if ((trt_state->fp16_enable || trt_state->int8_enable) && trt_state->dla_enable) { - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] use DLA core " << trt_state->dla_core; - trt_config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); - trt_config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA); - trt_config->setDLACore(trt_state->dla_core); - } + // Set DLA (DLA can only run with FP16 or INT8) + if ((trt_state->fp16_enable || trt_state->int8_enable) && trt_state->dla_enable) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] use DLA core " << trt_state->dla_core; + trt_config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); + trt_config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA); + trt_config->setDLACore(trt_state->dla_core); + } - // enable sparse weights - if (trt_state->sparsity_enable) { - trt_config->setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Sparse weights are allowed"; - } + // enable sparse weights + if (trt_state->sparsity_enable) { + trt_config->setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Sparse weights are allowed"; + } - // enable builder heuristics - if (trt_state->build_heuristics_enable) { - 
trt_config->setFlag(nvinfer1::BuilderFlag::kENABLE_TACTIC_HEURISTIC); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Builder heuristics are enabled"; - } + // enable builder heuristics + if (trt_state->build_heuristics_enable) { + trt_config->setFlag(nvinfer1::BuilderFlag::kENABLE_TACTIC_HEURISTIC); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Builder heuristics are enabled"; + } #if NV_TENSORRT_MINOR > 5 && NV_TENSORRT_MAJOR >= 8 - // switch optimizaion level - if (trt_state->builder_optimization_level != 3) { - trt_config->setBuilderOptimizationLevel(trt_state->builder_optimization_level); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Builder optimization level is set to " << builder_optimization_level_; - } + // switch optimizaion level + if (trt_state->builder_optimization_level != 3) { + trt_config->setBuilderOptimizationLevel(trt_state->builder_optimization_level); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Builder optimization level is set to " << builder_optimization_level_; + } - // limit auxiliary streams - if (trt_state->auxiliary_streams >= 0) { - trt_config->setMaxAuxStreams(trt_state->auxiliary_streams); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Auxiliary streams are se to " << trt_state->auxiliary_streams; - } + // limit auxiliary streams + if (trt_state->auxiliary_streams >= 0) { + trt_config->setMaxAuxStreams(trt_state->auxiliary_streams); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Auxiliary streams are se to " << trt_state->auxiliary_streams; + } #else - if (trt_state->builder_optimization_level != 3) { - LOGS_DEFAULT(WARNING) << "[TensorRT EP] Builder optimization level can only be used on TRT 8.6 onwards!"; - } - if (trt_state->auxiliary_streams >= 0) { - LOGS_DEFAULT(WARNING) << "[TensorRT EP] Auxiliary streams can only be set on TRT 8.6 onwards!"; - } + if (trt_state->builder_optimization_level != 3) { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Builder optimization level can only be used on TRT 8.6 onwards!"; + } + if (trt_state->auxiliary_streams >= 0) { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Auxiliary streams can only be set on TRT 8.6 onwards!"; + } #endif - // limit used tactic sources - if (trt_state->filter_tactic_sources) { - nvinfer1::TacticSources tactics = trt_config->getTacticSources(); - tactics |= trt_state->tactic_sources; - trt_config->setTacticSources(tactics); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Tactic sources are limited using bitmask " << tactics; + // limit used tactic sources + if (trt_state->filter_tactic_sources) { + nvinfer1::TacticSources tactics = trt_config->getTacticSources(); + tactics |= trt_state->tactic_sources; + trt_config->setTacticSources(tactics); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Tactic sources are limited using bitmask " << tactics; + } + + // Load timing cache from file. Create a fresh cache if the file doesn't exist + std::unique_ptr timing_cache = nullptr; + if (trt_state->timing_cache_enable) { + std::vector loaded_timing_cache = loadTimingCacheFile(timing_cache_path); + timing_cache.reset(trt_config->createTimingCache(static_cast(loaded_timing_cache.data()), loaded_timing_cache.size())); + if (timing_cache == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not create timing cache: " + timing_cache_path); } - - // Load timing cache from file. 
Create a fresh cache if the file doesn't exist - std::unique_ptr timing_cache = nullptr; - if (trt_state->timing_cache_enable) { - std::vector loaded_timing_cache = loadTimingCacheFile(timing_cache_path); - timing_cache.reset(trt_config->createTimingCache(static_cast(loaded_timing_cache.data()), loaded_timing_cache.size())); - if (timing_cache == nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP could not create timing cache: " + timing_cache_path); - } - trt_config->setTimingCache(*timing_cache, force_timing_cache_match_); - if (detailed_build_log_) { - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Deserialized timing cache from " + timing_cache_path; - } + trt_config->setTimingCache(*timing_cache, force_timing_cache_match_); + if (detailed_build_log_) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Deserialized timing cache from " + timing_cache_path; } + } - // Build engine - { - auto lock = GetApiLock(); - std::chrono::steady_clock::time_point engine_build_start; - if (detailed_build_log_) { - engine_build_start = std::chrono::steady_clock::now(); - } - *(trt_state->engine) = std::unique_ptr( - trt_builder->buildEngineWithConfig(*trt_state->network->get(), *trt_config)); - if (detailed_build_log_) { - auto engine_build_stop = std::chrono::steady_clock::now(); - LOGS_DEFAULT(INFO) << "TensorRT engine build for " << trt_state->trt_node_name_with_precision << " took: " << std::chrono::duration_cast(engine_build_stop - engine_build_start).count() << "ms" << std::endl; - } - } - if (*(trt_state->engine) == nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine."); + // Build engine + { + auto lock = GetApiLock(); + std::chrono::steady_clock::time_point engine_build_start; + if (detailed_build_log_) { + engine_build_start = std::chrono::steady_clock::now(); } - trt_engine = trt_state->engine->get(); - if (trt_state->engine_cache_enable) { - // Serialize engine profile - SerializeProfileV2(profile_cache_path, shape_ranges); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + profile_cache_path; - - // Serialize engine - std::unique_ptr serializedModel(trt_engine->serialize()); - size_t engine_size = serializedModel->size(); - if (trt_state->engine_decryption_enable) { - // Encrypt engine. The library is not always deployed with the encrypt function, so check if it is available first. - if (trt_state->engine_encryption != nullptr) { - if (!trt_state->engine_encryption(encrypted_engine_cache_path.c_str(), reinterpret_cast(serializedModel->data()), engine_size)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP could not call engine encryption function encrypt"); - } - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized and encrypted engine " + encrypted_engine_cache_path; - } else { - LOGS_DEFAULT(WARNING) << "[TensorRT EP] Engine cache encryption function is not found. 
No cache is written to disk"; + *(trt_state->engine) = std::unique_ptr( + trt_builder->buildEngineWithConfig(*trt_state->network->get(), *trt_config)); + if (detailed_build_log_) { + auto engine_build_stop = std::chrono::steady_clock::now(); + LOGS_DEFAULT(INFO) << "TensorRT engine build for " << trt_state->trt_node_name_with_precision << " took: " << std::chrono::duration_cast(engine_build_stop - engine_build_start).count() << "ms" << std::endl; + } + } + if (*(trt_state->engine) == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine."); + } + trt_engine = trt_state->engine->get(); + if (trt_state->engine_cache_enable) { + // Serialize engine profile + SerializeProfileV2(profile_cache_path, shape_ranges); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + profile_cache_path; + + // Serialize engine + std::unique_ptr serializedModel(trt_engine->serialize()); + size_t engine_size = serializedModel->size(); + if (trt_state->engine_decryption_enable) { + // Encrypt engine. The library is not always deployed with the encrypt function, so check if it is available first. + if (trt_state->engine_encryption != nullptr) { + if (!trt_state->engine_encryption(encrypted_engine_cache_path.c_str(), reinterpret_cast(serializedModel->data()), engine_size)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not call engine encryption function encrypt"); } + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized and encrypted engine " + encrypted_engine_cache_path; } else { - std::ofstream file(engine_cache_path, std::ios::binary | std::ios::out); - file.write(reinterpret_cast(serializedModel->data()), engine_size); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + engine_cache_path; + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Engine cache encryption function is not found. 
No cache is written to disk"; } + } else { + std::ofstream file(engine_cache_path, std::ios::binary | std::ios::out); + file.write(reinterpret_cast(serializedModel->data()), engine_size); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + engine_cache_path; } + } - // serialize and save timing cache - if (trt_state->timing_cache_enable) { - auto timing_cache = trt_config->getTimingCache(); - std::unique_ptr timingCacheHostData{timing_cache->serialize()}; - if (timingCacheHostData == nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP could not serialize timing cache: " + timing_cache_path); - } - saveTimingCacheFile(timing_cache_path, timingCacheHostData.get()); - if (detailed_build_log_) { - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized timing cache " + timing_cache_path; - } + // serialize and save timing cache + if (trt_state->timing_cache_enable) { + auto timing_cache = trt_config->getTimingCache(); + std::unique_ptr timingCacheHostData{timing_cache->serialize()}; + if (timingCacheHostData == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not serialize timing cache: " + timing_cache_path); + } + saveTimingCacheFile(timing_cache_path, timingCacheHostData.get()); + if (detailed_build_log_) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized timing cache " + timing_cache_path; } - context_update = true; } + context_update = true; } // Build execution context if either of the following conditions is true: From cf672c5887b4e5991b022df38fbb8d61f29fe420 Mon Sep 17 00:00:00 2001 From: Edward Chen <18449977+edgchen1@users.noreply.github.com> Date: Tue, 12 Sep 2023 10:56:35 -0700 Subject: [PATCH 02/34] Use name of temporary provisioning profile. (#17459) The old provisioning profile no longer works. Switched to a temporary one that we can use before a new one is available. The temporary one has a different name. --- .../templates/stages/mac-ios-packaging-build-stage.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml index 2484facfae33e..81f17a26b16a6 100644 --- a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml @@ -116,7 +116,8 @@ stages: xcodeDeveloperDir: '/Applications/Xcode_${{ variables.xcodeVersion }}.app/Contents/Developer' signingOption: 'manual' signingIdentity: '$(APPLE_CERTIFICATE_SIGNING_IDENTITY)' - provisioningProfileName: 'iOS Team Provisioning Profile' + provisioningProfileName: 'temporary *' # temporary name, change it back to the original below later + #provisioningProfileName: 'iOS Team Provisioning Profile' args: '-derivedDataPath $(Build.BinariesDirectory)/app_center_test/ios_package_test/DerivedData' workingDirectory: '$(Build.BinariesDirectory)/app_center_test/ios_package_test/' useXcpretty: false # xcpretty can hide useful error output so we will disable it From 49511b548371b2af4d9c2d186204dd85dc0a1a7e Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 12 Sep 2023 11:38:36 -0700 Subject: [PATCH 03/34] Improve performance of prune_graph in onnx_model.py (#17502) During optimization of SDXL UNet, the prune_graph takes up to 5 minutes. The cause is to find a node in all nodes is time-consuming. This optimization will reduce the latency of prune_graph to 2 seconds. 
New algorithm will use a hash table (key is first node output, value is node) to speed up. --- .../python/tools/transformers/onnx_model.py | 78 ++++++++++++------- 1 file changed, 52 insertions(+), 26 deletions(-) diff --git a/onnxruntime/python/tools/transformers/onnx_model.py b/onnxruntime/python/tools/transformers/onnx_model.py index 8c836db7b9ef6..60be2d84b2bc8 100644 --- a/onnxruntime/python/tools/transformers/onnx_model.py +++ b/onnxruntime/python/tools/transformers/onnx_model.py @@ -816,51 +816,77 @@ def prune_graph(self, outputs=None, allow_remove_graph_inputs=True): """ if len(self.graphs()) > 1: + # TODO(tianleiwu): handle subgraph logger.debug("Skip prune_graph since graph has subgraph") return - if outputs is None: - outputs = [output.name for output in self.model.graph.output] + keep_outputs = [output.name for output in self.model.graph.output] if outputs is None else outputs output_name_to_node = self.output_name_to_node() - all_nodes = [] - for output in outputs: - if output in output_name_to_node: - last_node = output_name_to_node[output] - if last_node in all_nodes: - continue - nodes = self.get_parent_subgraph_nodes(last_node, []) - all_nodes.append(last_node) - all_nodes.extend(nodes) - nodes_to_remove = [node for node in self.model.graph.node if node not in all_nodes] + def get_first_output(node): + if node.output[0]: + return node.output[0] + return next(iter([o for o in node.output if o]), None) - self.remove_nodes(nodes_to_remove) + # Keep track of nodes to keep. The key is first output of node, and the value is the node. + output_to_node = {} - # remove outputs not in list - output_to_remove = [] - for output in self.model.graph.output: - if output.name not in outputs: - output_to_remove.append(output) - for output in output_to_remove: - self.model.graph.output.remove(output) + # Start from graph outputs, and find parent nodes recurisvely, and add nodes to the output_to_node dictionary. + dq = deque() + for output in keep_outputs: + if output in output_name_to_node: + dq.append(output_name_to_node[output]) + while len(dq) > 0: + node = dq.pop() + first_output = get_first_output(node) + if first_output and (first_output not in output_to_node): + output_to_node[first_output] = node + for name in node.input: + if len(name) > 0 and (name in output_name_to_node) and (name not in output_to_node): + dq.appendleft(output_name_to_node[name]) + + # Keep only those nodes in the output_to_node dictionary. + nodes_to_keep = [] + num_nodes_removed = 0 + for node in self.model.graph.node: + first_output = get_first_output(node) + kept_node = output_to_node[first_output] if first_output in output_to_node else None - # remove inputs not used by any node. + # Need double check the node since fused node might reuse output name of some nodes to be removed. + # It is slow to compare whole node, so we compare op_type first to avoid comparing node in most cases. + if kept_node and kept_node.op_type == node.op_type and kept_node == node: + nodes_to_keep.append(node) + else: + num_nodes_removed += 1 + self.model.graph.ClearField("node") + self.model.graph.node.extend(nodes_to_keep) + + # Remove graph outputs not in list + output_to_remove = [] + if outputs is not None: + for output in self.model.graph.output: + if output.name not in outputs: + output_to_remove.append(output) + for output in output_to_remove: + self.model.graph.output.remove(output) + + # Remove graph inputs not used by any node. 
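A minimal sketch of the hash-table approach this commit message describes, with simplified, hypothetical node and graph structures rather than the actual onnx_model.py types: map each node's first output name to the node, then walk backwards from the graph outputs so reachability checks become dictionary lookups instead of scans over all nodes.

from collections import deque

def reachable_nodes(graph_output_names, output_name_to_node):
    """Return the nodes reachable from the graph outputs, keyed by first output name.

    output_name_to_node maps every output name to its producing node; each node is
    assumed here to be a dict with "inputs" and "outputs" lists of tensor names.
    """
    kept = {}  # first output name -> node, doubles as the visited set
    dq = deque(output_name_to_node[name] for name in graph_output_names
               if name in output_name_to_node)
    while dq:
        node = dq.pop()
        first_output = next((o for o in node["outputs"] if o), None)
        if first_output is None or first_output in kept:
            continue  # already visited, or the node produces no named output
        kept[first_output] = node
        for name in node["inputs"]:
            # Enqueue producers of this node's inputs that have not been kept yet.
            if name and name in output_name_to_node and name not in kept:
                dq.appendleft(output_name_to_node[name])
    return kept
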
input_to_remove = [] if allow_remove_graph_inputs: input_name_to_nodes = self.input_name_to_nodes() input_to_remove = [input for input in self.model.graph.input if input.name not in input_name_to_nodes] - for input in input_to_remove: - self.model.graph.input.remove(input) + for name in input_to_remove: + self.model.graph.input.remove(name) - if input_to_remove or output_to_remove or nodes_to_remove: + if input_to_remove or output_to_remove or num_nodes_removed > 0: removed = [] if input_to_remove: removed.append(f"{len(input_to_remove)} inputs") if output_to_remove: removed.append(f"{len(output_to_remove)} outputs") - if nodes_to_remove: - removed.append(f"{len(nodes_to_remove)} nodes") + if num_nodes_removed > 0: + removed.append(f"{num_nodes_removed} nodes") logger.info("Removed %s", ", ".join(removed)) self.update_graph() From f923eec28b6b4237d6fc251042893deab335d97e Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 12 Sep 2023 16:59:13 -0700 Subject: [PATCH 04/34] [js/web] release session after use in npm test (#17470) ### Description release session after use in npm test. This is one of the prerequisites for supporting IO binding for WebGPU buffer in onnxruntime-web. list of prerequisites PRs: #17465 #17469 #17470 (this one) --- js/web/test/test-main.ts | 4 ++-- js/web/test/test-runner.ts | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/js/web/test/test-main.ts b/js/web/test/test-main.ts index e614cc8e67e71..49d0ac225be2f 100644 --- a/js/web/test/test-main.ts +++ b/js/web/test/test-main.ts @@ -110,9 +110,9 @@ for (const group of ORT_WEB_TEST_CONFIG.model) { test, ORT_WEB_TEST_CONFIG.profile, ORT_WEB_TEST_CONFIG.options.sessionOptions); }); - after('release session', () => { + after('release session', async () => { if (context) { - context.release(); + await context.release(); } }); diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index 9802f00f7a866..46d80a9f56f35 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -210,11 +210,12 @@ export class ModelTestContext { Logger.verbose('TestRunner.Perf', '***Perf Data End'); } - release(): void { + async release(): Promise { if (this.profile) { this.session.endProfiling(); } this.logPerfData(); + await this.session.release(); } /** From 9b755dce9f6b39296a716028f52347de037daa2e Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Tue, 12 Sep 2023 17:40:49 -0700 Subject: [PATCH 05/34] Delete all Prefast tasks (#17522) ### Description Delete all Prefast tasks because the new VS 17.7 version crashes every time when we run the task on our CI build servers. However, we cannot reproduce it locally. And this problem blocks us installing security patches to our CI build machines. Will use [CodeQL](https://codeql.github.com/) instead. ### Motivation and Context Address some security alerts. 
--- .github/workflows/sca.yml | 133 ------------------ .../azure-pipelines/post-merge-jobs.yml | 6 - .../azure-pipelines/templates/compliance.yml | 21 --- .../templates/jobs/win-ci-vs-2022-job.yml | 48 ------- .../templates/py-packaging-stage.yml | 18 --- .../azure-pipelines/templates/py-win-gpu.yml | 59 -------- .../azure-pipelines/templates/win-ci.yml | 19 --- .../azure-pipelines/win-ci-pipeline.yml | 10 -- .../azure-pipelines/win-gpu-ci-pipeline.yml | 4 - 9 files changed, 318 deletions(-) delete mode 100644 .github/workflows/sca.yml diff --git a/.github/workflows/sca.yml b/.github/workflows/sca.yml deleted file mode 100644 index 1416f5a4d33a9..0000000000000 --- a/.github/workflows/sca.yml +++ /dev/null @@ -1,133 +0,0 @@ -name: Windows_SCA -on: - push: - branches: - - main - - rel-* - pull_request: - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -env: - AZCOPY_AUTO_LOGIN_TYPE: MSI - AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4 - -jobs: - Onnxruntime-SCA-training-CUDA: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] - steps: - - uses: actions/checkout@v3 - with: - submodules: false - - uses: actions/setup-python@v4 - with: - python-version: '3.11.x' - architecture: 'x64' - - - uses: actions/setup-node@v3 - with: - node-version: 18 - - - name: Download cuda - run: azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v11.8" cuda_sdk - - - - name: Delete build folder - run: | - if (Test-Path D:\b) { Remove-Item -Recurse -Force D:\b } - &tools\ci_build\github\windows\install_third_party_deps.ps1 -cpu_arch x64 -install_prefix D:\b\Debug\installed -build_config Debug - - # The build machine doesn't have a GPU. So the value of CMAKE_CUDA_ARCHITECTURES doesn't matter. 
- - name: Build code - env: - CAExcludePath: 'C:\Program Files;D:\b;${{ github.workspace }}\cmake' - run: python tools\ci_build\build.py --windows_sdk_version 10.0.22621.0 --enable_training --build_java --compile_no_warning_as_error --config Debug --build_dir D:\b --skip_submodule_sync --build_csharp --update --build --parallel --cmake_generator "Visual Studio 17 2022" --build_shared_lib --enable_pybind --cmake_extra_defines onnxruntime_USE_CUSTOM_STATIC_ANALYSIS_RULES=ON --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON --cmake_extra_defines onnxruntime_REDIRECT_STATIC_ANALYSIS_OUTPUTS_TO_FILE=ON --use_cuda --cuda_home=${{ github.workspace }}\cuda_sdk\v11.8 --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75 - - - name: Generate sarif - working-directory: D:\b - run: npx @microsoft/sarif-multitool merge *.sarif --recurse --output-directory=${{ github.workspace }}\output --output-file=MergeResult.sarif --merge-runs && dir ${{ github.workspace }}\output - - - name: Upload SARIF to GitHub - uses: github/codeql-action/upload-sarif@v2 - continue-on-error: true - with: - sarif_file: ${{ github.workspace }}\output\MergeResult.sarif - category: VS_SCA - - # No python - Onnxruntime-SCA-win32-WINML-x64: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] - steps: - - uses: actions/checkout@v3 - with: - submodules: false - - uses: actions/setup-python@v4 - with: - python-version: '3.11.x' - architecture: 'x64' - - - uses: actions/setup-node@v3 - with: - node-version: 18 - - - name: Delete build folder - run: | - if (Test-Path D:\b) { Remove-Item -Recurse -Force D:\b } - &tools\ci_build\github\windows\install_third_party_deps.ps1 -cpu_arch x64 -install_prefix D:\b\Debug\installed -build_config Debug - - # The build machine doesn't have a GPU. So the value of CMAKE_CUDA_ARCHITECTURES doesn't matter. 
- - name: Build code - env: - CAExcludePath: 'C:\Program Files;D:\b;${{ github.workspace }}\cmake' - run: python tools\ci_build\build.py --build_java --compile_no_warning_as_error --config Debug --build_dir D:\b --skip_submodule_sync --build_csharp --update --build --parallel --cmake_generator "Visual Studio 17 2022" --build_shared_lib --cmake_extra_defines onnxruntime_USE_CUSTOM_STATIC_ANALYSIS_RULES=ON --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON --cmake_extra_defines onnxruntime_REDIRECT_STATIC_ANALYSIS_OUTPUTS_TO_FILE=ON --ms_experimental --use_dml --use_winml --disable_rtti --enable_wcos --build_shared_lib - - - name: Generate sarif - working-directory: D:\b - run: npx @microsoft/sarif-multitool merge *.sarif --recurse --output-directory=${{ github.workspace }}\output --output-file=MergeResult.sarif --merge-runs && dir ${{ github.workspace }}\output - - - name: Upload SARIF to GitHub - uses: github/codeql-action/upload-sarif@v2 - continue-on-error: true - with: - sarif_file: ${{ github.workspace }}\output\MergeResult.sarif - category: VS_SCA_WIN32_WINML_X64 - - # No java, No python - Onnxruntime-SCA-win32-WINML-x86: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] - steps: - - uses: actions/checkout@v3 - with: - submodules: false - - uses: actions/setup-python@v4 - with: - python-version: '3.11.x' - architecture: 'x86' - - - uses: actions/setup-node@v3 - with: - node-version: 18 - - - name: Delete build folder - run: | - if (Test-Path D:\b) { Remove-Item -Recurse -Force D:\b } - &tools\ci_build\github\windows\install_third_party_deps.ps1 -cpu_arch x86 -install_prefix D:\b\Debug\installed -build_config Debug - - # The build machine doesn't have a GPU. So the value of CMAKE_CUDA_ARCHITECTURES doesn't matter. - - name: Build code - env: - CAExcludePath: 'C:\Program Files;D:\b;${{ github.workspace }}\cmake' - run: python tools\ci_build\build.py --compile_no_warning_as_error --config Debug --build_dir D:\b --skip_submodule_sync --build_csharp --update --build --parallel --cmake_generator "Visual Studio 17 2022" --build_shared_lib --cmake_extra_defines onnxruntime_USE_CUSTOM_STATIC_ANALYSIS_RULES=ON --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON --cmake_extra_defines onnxruntime_REDIRECT_STATIC_ANALYSIS_OUTPUTS_TO_FILE=ON --ms_experimental --use_dml --use_winml --disable_rtti --enable_wcos --build_shared_lib - - - name: Generate sarif - working-directory: D:\b - run: npx @microsoft/sarif-multitool merge *.sarif --recurse --output-directory=${{ github.workspace }}\output --output-file=MergeResult.sarif --merge-runs && dir ${{ github.workspace }}\output - - - name: Upload SARIF to GitHub - uses: github/codeql-action/upload-sarif@v2 - continue-on-error: true - with: - sarif_file: ${{ github.workspace }}\output\MergeResult.sarif - category: VS_SCA_WIN32_WINML_X86 diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml index 113b24f7579ac..61f9b37d4ce78 100644 --- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml @@ -74,7 +74,6 @@ stages: isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: true - RunStaticCodeAnalysis: false ORT_EP_NAME: CUDA WITH_CACHE: true MachinePool: onnxruntime-Win2022-GPU-MultiA10 @@ -95,7 +94,6 @@ stages: isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: true - RunStaticCodeAnalysis: false ORT_EP_NAME: TRT WITH_CACHE: true MachinePool: 
onnxruntime-Win2022-GPU-MultiA10 @@ -114,7 +112,6 @@ stages: isX86: false job_name_suffix: x64_mimalloc RunOnnxRuntimeTests: true - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: CPU GenerateDocumentation: false @@ -134,7 +131,6 @@ stages: isX86: false job_name_suffix: x64_no_memory_profiling RunOnnxRuntimeTests: false - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: CPU GenerateDocumentation: false @@ -154,7 +150,6 @@ stages: isX86: false job_name_suffix: x64_minimal_no_exception RunOnnxRuntimeTests: true - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: CPU GenerateDocumentation: false @@ -174,7 +169,6 @@ stages: isX86: false job_name_suffix: x64_debug_node_input_output RunOnnxRuntimeTests: true - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: CPU GenerateDocumentation: false diff --git a/tools/ci_build/github/azure-pipelines/templates/compliance.yml b/tools/ci_build/github/azure-pipelines/templates/compliance.yml index f4bce8c53605b..cc451425be42a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/compliance.yml +++ b/tools/ci_build/github/azure-pipelines/templates/compliance.yml @@ -18,27 +18,6 @@ steps: AnalyzeTargetGlob: '+:file|$(Build.ArtifactStagingDirectory)\**\*.dll;-:file|$(Build.ArtifactStagingDirectory)\**\DirectML.dll' continueOnError: true -- task: DeleteFiles@1 - displayName: 'Delete files from $(Build.BinariesDirectory)\RelWithDebInfo' - inputs: - SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo' - Contents: | - **/*.obj - **/*.pdb - **/*.dll - -# Manually set msBuildCommandline so that we can also set CAExcludePath -- task: SDLNativeRules@3 - displayName: 'Run the PREfast SDL Native Rules for MSBuild' - inputs: - userProvideBuildInfo: msBuildInfo - msBuildArchitecture: x64 - msBuildVersion: 17.0 - msBuildCommandline: '"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Current\Bin\amd64\msbuild.exe" "$(Build.BinariesDirectory)\RelWithDebInfo\onnxruntime.sln" /p:platform="${{parameters.msbuildPlatform}}" /p:configuration="RelWithDebInfo" /p:CAExcludePath="$(Build.BinariesDirectory);$(Build.SourcesDirectory)\cmake;C:\program files (x86)" /p:VisualStudioVersion="17.0" /m /p:PreferredToolArchitecture=x64' - excludedPaths: '$(Build.SourcesDirectory)\b#$(Build.SourcesDirectory)\cmake#C:\program files#C:\program files (x86)#C:\program files' - rulesetName: Custom - customRuleset: $(Build.SourcesDirectory)\cmake\Sdl.ruleset - - task: SdtReport@2 displayName: 'Create Security Analysis Report' inputs: diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml index 67a03beab9362..46f2ae7b97acc 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml @@ -34,11 +34,6 @@ parameters: type: boolean default: true -- name: RunStaticCodeAnalysis - displayName: Run Static Code Analysis - type: boolean - default: true - - name: ORT_EP_NAME type: string @@ -220,49 +215,6 @@ jobs: workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}' displayName: 'Run tests' - - - ${{ if eq(parameters.RunStaticCodeAnalysis, true) }}: - - task: DeleteFiles@1 - displayName: 'Delete binaries files from $(Build.BinariesDirectory)\RelWithDebInfo' - inputs: - SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo' - Contents: | - **/*.obj - **/*.pdb - **/*.dll - 
- - # Manually set msBuildCommandline so that we can also set CAExcludePath - # build_dir must be a sub folder of $(Build.SourcesDirectory) - # TODO: move this step to a CPU-only machine to save GPU resources. - - task: SDLNativeRules@3 - displayName: 'Run the PREfast SDL Native Rules for MSBuild' - inputs: - msBuildArchitecture: amd64 - setupCommandlines: 'python $(Build.SourcesDirectory)\tools\ci_build\build.py --config RelWithDebInfo --build_dir $(Build.SourcesDirectory)\b --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_shared_lib --enable_onnx_tests ${{ parameters.additionalBuildFlags }} --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON onnxruntime_ENABLE_LTO=OFF' - msBuildCommandline: '"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Current\Bin\amd64\msbuild.exe" "$(Build.SourcesDirectory)\b\RelWithDebInfo\onnxruntime.sln" /p:RunCodeAnalysis=true /p:platform=${{ parameters.msbuildPlatform }} /p:configuration=RelWithDebInfo /p:VisualStudioVersion="17.0" /m /p:PreferredToolArchitecture=x64' - excludedPaths: '$(Build.SourcesDirectory)\b#$(Build.SourcesDirectory)\cmake#C:\program files#C:\program files (x86)#C:\program files' - rulesetName: Custom - customRuleset: $(Build.SourcesDirectory)\cmake\Sdl.ruleset - publishXML: true - - - task: SdtReport@2 - displayName: 'Create Security Analysis Report' - inputs: - SDLNativeRules: true - - - task: PublishSecurityAnalysisLogs@3 - displayName: 'Publish Security Analysis Logs' - continueOnError: true - - - task: PostAnalysis@2 - displayName: 'Guardian Break v2' - inputs: - GdnBreakGdnToolSDLNativeRulesSeverity: Note - GdnBreakGdnToolSDLNativeRules: true - - - - ${{ if eq(parameters.RunOnnxRuntimeTests, true) }}: - task: PublishTestResults@2 displayName: 'Publish unit test results' inputs: diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index 8812d4ed91ae7..1305f5ae21725 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -246,24 +246,6 @@ stages: workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)\$(BuildConfig)' displayName: 'Run Python Tests' - #Skip it for 32 bits x86 build. Currently the scan tool has a bug: it doesn't allow me use 64 bits link.exe - #in 32 bits Win32 build. I tried all the settings but they all don't work. 
- - task: SDLNativeRules@3 - displayName: 'Run the PREfast SDL Native Rules for MSBuild' - condition: and (succeeded(), and(eq(variables['buildArch'], 'x64'), eq(variables['PythonVersion'], '3.8'))) - inputs: - msBuildArchitecture: amd64 - setupCommandlines: 'python $(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug --build_dir $(Build.SourcesDirectory)\b --skip_submodule_sync --cmake_generator "Visual Studio 17 2022" --enable_pybind --enable_onnx_tests --parallel $(TelemetryOption) --update --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON onnxruntime_ENABLE_LTO=OFF' - msBuildCommandline: '"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Current\Bin\amd64\msbuild.exe" "$(Build.SourcesDirectory)\b\Debug\onnxruntime.sln" /p:RunCodeAnalysis=true /p:platform="$(MsbuildPlatform)" /p:configuration=Debug /p:VisualStudioVersion="17.0" /m /p:PreferredToolArchitecture=x64' - excludedPaths: '$(Build.SourcesDirectory)\b#$(Build.SourcesDirectory)\cmake#C:\program files#C:\program files (x86)#C:\program files' - rulesetName: Custom - customRuleset: $(Build.SourcesDirectory)\cmake\Sdl.ruleset - - - task: SdtReport@2 - displayName: 'Create Security Analysis Report' - inputs: - SDLNativeRules: true - - task: TSAUpload@2 displayName: 'TSA upload' condition: and(and (succeeded(), and(eq(variables['buildArch'], 'x64'), eq(variables['PythonVersion'], '3.8'))), eq(variables['Build.SourceBranch'], 'refs/heads/main')) diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml index ef938a634554a..919749cac15b6 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml @@ -22,65 +22,6 @@ parameters: default: '' jobs: -- ${{ if eq(parameters.PYTHON_VERSION, '3.8') }}: - - job: Win_py_${{ parameters.EP_NAME }}_Wheels_StaticAnalysis - timeoutInMinutes: 240 - workspace: - clean: all - pool: onnxruntime-Win-CPU-2022 - steps: - - checkout: self - clean: true - submodules: none - - task: UsePythonVersion@0 - inputs: - versionSpec: 3.8 - addToPath: true - architecture: 'x64' - - task: onebranch.pipeline.tsaoptions@1 - displayName: 'OneBranch TSAOptions' - inputs: - tsaConfigFilePath: '$(Build.SourcesDirectory)\.config\tsaoptions.json' - appendSourceBranchName: false - - - template: download-deps.yml - - - template: jobs/set-winenv.yml - parameters: - EnvSetupScript: ${{ parameters.ENV_SETUP_SCRIPT }} - DownloadCUDA: true - - - task: PythonScript@0 - displayName: 'Update deps.txt' - inputs: - scriptPath: $(Build.SourcesDirectory)/tools/ci_build/replace_urls_in_deps.py - arguments: --new_dir $(Build.BinariesDirectory)/deps - workingDirectory: $(Build.BinariesDirectory) - - - task: SDLNativeRules@3 - displayName: 'Run the PREfast SDL Native Rules for MSBuild' - inputs: - msBuildArchitecture: amd64 - setupCommandlines: 'python $(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug --build_dir $(Build.SourcesDirectory)\b --skip_submodule_sync --cmake_generator "Visual Studio 17 2022" --enable_pybind ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} --update --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON onnxruntime_ENABLE_LTO=OFF' - msBuildCommandline: '"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Current\Bin\amd64\msbuild.exe" "$(Build.SourcesDirectory)\b\Debug\onnxruntime.sln" /p:RunCodeAnalysis=true /p:platform=x64 /p:configuration=Debug 
/p:VisualStudioVersion="17.0" /m /p:PreferredToolArchitecture=x64' - excludedPaths: '$(Build.SourcesDirectory)\b#$(Build.SourcesDirectory)\cmake#C:\program files#C:\program files (x86)#C:\program files' - rulesetName: Custom - customRuleset: $(Build.SourcesDirectory)\cmake\Sdl.ruleset - publishXML: true - - - task: SdtReport@2 - displayName: 'Create Security Analysis Report' - inputs: - SDLNativeRules: true - - - task: TSAUpload@2 - displayName: 'TSA upload' - condition: and (succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main')) - inputs: - GdnPublishTsaOnboard: false - GdnPublishTsaConfigFile: '$(Build.sourcesDirectory)\.gdn\.gdntsa' - - - job: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }} timeoutInMinutes: 240 workspace: diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index f6da7bb857b7d..80d285f3fd3fb 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -263,25 +263,6 @@ stages: AnalyzeTargetGlob: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\**\*.dll' continueOnError: true - - task: DeleteFiles@1 - displayName: 'Delete files from $(Build.BinariesDirectory)\RelWithDebInfo' - inputs: - SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo' - Contents: | - **/*.obj - **/*.pdb - **/*.dll - - #Manually set msBuildCommandline so that we can also set CAExcludePath - - task: SDLNativeRules@3 - displayName: 'Run the PREfast SDL Native Rules for MSBuild' - condition: and (succeeded(), eq(variables['msbuildPlatform'], 'x64')) - inputs: - msBuildArchitecture: amd64 - setupCommandlines: 'python $(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }} --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON' - msBuildCommandline: '"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Current\Bin\amd64\msbuild.exe" "$(Build.BinariesDirectory)\Debug\onnxruntime.sln" /p:platform="$(MsbuildPlatform)" /p:configuration=Debug /p:VisualStudioVersion="17.0" /m /p:PreferredToolArchitecture=x64' - excludedPaths: '$(Build.BinariesDirectory)#$(Build.SourcesDirectory)\cmake#C:\program files (x86)' - - task: PostAnalysis@2 inputs: GdnBreakAllTools: false diff --git a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml index b9b833a3155bf..2a5622faf2905 100644 --- a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml @@ -47,7 +47,6 @@ stages: isX86: false job_name_suffix: x64_debug RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: CPU GenerateDocumentation: false @@ -69,7 +68,6 @@ stages: isX86: false job_name_suffix: x64_release RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: CPU GenerateDocumentation: false @@ -89,7 +87,6 @@ stages: isX86: false job_name_suffix: x64_release RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: DNNL GenerateDocumentation: false @@ -111,7 +108,6 @@ stages: isX86: false 
job_name_suffix: x64_release RunOnnxRuntimeTests: true - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: XNNPACK GenerateDocumentation: false @@ -132,7 +128,6 @@ stages: job_name_suffix: x64_release_winml RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} # WinML has many warnings - RunStaticCodeAnalysis: false EnablePython: false isTraining: false ORT_EP_NAME: CPU @@ -153,7 +148,6 @@ stages: isX86: true job_name_suffix: x86_release RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: CPU GenerateDocumentation: false @@ -173,7 +167,6 @@ stages: isX86: false job_name_suffix: training_x64_debug RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false isTraining: true ORT_EP_NAME: CPU GenerateDocumentation: false @@ -193,7 +186,6 @@ stages: isX86: false job_name_suffix: training_x64_release RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: true isTraining: true ORT_EP_NAME: CPU GenerateDocumentation: false @@ -213,7 +205,6 @@ stages: isX86: false job_name_suffix: ort_training_apis_x64_release RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false EnablePython: false isTraining: true ORT_EP_NAME: CPU @@ -234,7 +225,6 @@ stages: isX86: false job_name_suffix: x64_release_azure RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false EnablePython: false isTraining: false ORT_EP_NAME: CPU diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml index 69e71c1266664..8796917afa37d 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml @@ -47,7 +47,6 @@ stages: isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false ORT_EP_NAME: CUDA WITH_CACHE: true MachinePool: onnxruntime-Win2022-GPU-A10 @@ -65,7 +64,6 @@ stages: isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false ORT_EP_NAME: CUDA WITH_CACHE: true # Some unit tests crash on A10 GPUs. So this job still needs to use T4. @@ -85,7 +83,6 @@ stages: isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false ORT_EP_NAME: DML WITH_CACHE: true MachinePool: onnxruntime-Win2022-GPU-dml-A10 @@ -104,7 +101,6 @@ stages: isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: false - RunStaticCodeAnalysis: false GenerateDocumentation: true ORT_EP_NAME: CUDA # It doesn't really matter which EP is selected here since this stage is for documentation. 
WITH_CACHE: true From bbcf4b45dccb5ef7c2c8b516feb89505696ff602 Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Tue, 12 Sep 2023 20:44:27 -0700 Subject: [PATCH 06/34] Upgrade doxygen to 1.9.8 (#17525) --- .github/workflows/publish-c-apidocs.yml | 6 +- docs/c_cxx/Doxyfile | 414 ++++++++++++++++++------ 2 files changed, 325 insertions(+), 95 deletions(-) diff --git a/.github/workflows/publish-c-apidocs.yml b/.github/workflows/publish-c-apidocs.yml index 2fbd8e521aeee..73e8194bf7a8d 100644 --- a/.github/workflows/publish-c-apidocs.yml +++ b/.github/workflows/publish-c-apidocs.yml @@ -30,13 +30,13 @@ jobs: sudo apt update sudo apt-get install libclang-dev sudo apt-get install libclang-cpp14 - wget https://www.doxygen.nl/files/doxygen-1.9.6.linux.bin.tar.gz - tar xvzf doxygen-1.9.6.linux.bin.tar.gz + wget https://www.doxygen.nl/files/doxygen-1.9.8.linux.bin.tar.gz + tar xvzf doxygen-1.9.8.linux.bin.tar.gz - name: Run doxygen run: | mkdir -p build/doxygen cd docs/c_cxx - ../../doxygen-1.9.6/bin/doxygen + ../../doxygen-1.9.8/bin/doxygen - name: Log source commit run: git rev-parse --short HEAD > build/doxygen/html/source-version.txt - name: Move C/C++ docs into site diff --git a/docs/c_cxx/Doxyfile b/docs/c_cxx/Doxyfile index 94b39d2045f69..aedb1fdcfee75 100644 --- a/docs/c_cxx/Doxyfile +++ b/docs/c_cxx/Doxyfile @@ -1,4 +1,4 @@ -# Doxyfile 1.9.2 +# Doxyfile 1.9.8 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. @@ -12,6 +12,16 @@ # For lists, items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (\" \"). +# +# Note: +# +# Use doxygen to compare the used configuration file with the template +# configuration file: +# doxygen -x [configFile] +# Use doxygen to compare the used configuration file with the template +# configuration file without replacing the environment variables or CMake type +# replacement variables: +# doxygen -x_noenv [configFile] #--------------------------------------------------------------------------- # Project related configuration options @@ -60,16 +70,28 @@ PROJECT_LOGO = "../images/ONNX_Runtime_logo - Docs.png" OUTPUT_DIRECTORY = ../../build/doxygen -# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- -# directories (in 2 levels) under the output directory of each output format and -# will distribute the generated files over these directories. Enabling this +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create up to 4096 +# sub-directories (in 2 levels) under the output directory of each output format +# and will distribute the generated files over these directories. Enabling this # option can be useful when feeding doxygen a huge amount of source files, where # putting all generated files in the same directory would otherwise causes -# performance problems for the file system. +# performance problems for the file system. Adapt CREATE_SUBDIRS_LEVEL to +# control the number of sub-directories. # The default value is: NO. CREATE_SUBDIRS = NO +# Controls the number of sub-directories that will be created when +# CREATE_SUBDIRS tag is set to YES. Level 0 represents 16 directories, and every +# level increment doubles the number of directories, resulting in 4096 +# directories at level 8 which is the default and also the maximum value. The +# sub-directories are organized in 2 levels, the first level always has a fixed +# number of 16 directories. 
+# Minimum value: 0, maximum value: 8, default value: 8. +# This tag requires that the tag CREATE_SUBDIRS is set to YES. + +CREATE_SUBDIRS_LEVEL = 8 + # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII # characters to appear in the names of generated files. If set to NO, non-ASCII # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode @@ -81,14 +103,14 @@ ALLOW_UNICODE_NAMES = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. -# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, -# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), -# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, -# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), -# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, -# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, -# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, -# Ukrainian and Vietnamese. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Bulgarian, +# Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish, Dutch, English +# (United States), Esperanto, Farsi (Persian), Finnish, French, German, Greek, +# Hindi, Hungarian, Indonesian, Italian, Japanese, Japanese-en (Japanese with +# English messages), Korean, Korean-en (Korean with English messages), Latvian, +# Lithuanian, Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, +# Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, +# Swedish, Turkish, Ukrainian and Vietnamese. # The default value is: English. OUTPUT_LANGUAGE = English @@ -341,6 +363,17 @@ MARKDOWN_SUPPORT = YES TOC_INCLUDE_HEADINGS = 5 +# The MARKDOWN_ID_STYLE tag can be used to specify the algorithm used to +# generate identifiers for the Markdown headings. Note: Every identifier is +# unique. +# Possible values are: DOXYGEN use a fixed 'autotoc_md' string followed by a +# sequence number starting at 0 and GITHUB use the lower case version of title +# with any whitespace replaced by '-' and punctuation characters removed. +# The default value is: DOXYGEN. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +MARKDOWN_ID_STYLE = DOXYGEN + # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or @@ -437,7 +470,7 @@ INLINE_SIMPLE_STRUCTS = NO # types are typedef'ed and only the typedef is referenced, never the tag name. # The default value is: NO. -TYPEDEF_HIDES_STRUCT = YES +TYPEDEF_HIDES_STRUCT = NO # The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This # cache is used to resolve symbols given their name and scope. Since this can be @@ -452,7 +485,7 @@ TYPEDEF_HIDES_STRUCT = YES LOOKUP_CACHE_SIZE = 0 -# The NUM_PROC_THREADS specifies the number threads doxygen is allowed to use +# The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use # during processing. When set to 0 doxygen will based this on the number of # cores available in the system. 
You can set it explicitly to a value larger # than 0 to get more control over the balance between CPU load and processing @@ -465,6 +498,14 @@ LOOKUP_CACHE_SIZE = 0 NUM_PROC_THREADS = 1 +# If the TIMESTAMP tag is set different from NO then each generated page will +# contain the date or date and time when the page was generated. Setting this to +# NO can help when comparing the output of multiple runs. +# Possible values are: YES, NO, DATETIME and DATE. +# The default value is: NO. + +TIMESTAMP = NO + #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- @@ -546,7 +587,8 @@ HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. If set # to NO, these classes will be included in the various overviews. This option -# has no effect if EXTRACT_ALL is enabled. +# will also hide undocumented C++ concepts if enabled. This option has no effect +# if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_CLASSES = NO @@ -577,14 +619,15 @@ INTERNAL_DOCS = NO # filesystem is case sensitive (i.e. it supports files in the same directory # whose names only differ in casing), the option must be set to YES to properly # deal with such files in case they appear in the input. For filesystems that -# are not case sensitive the option should be be set to NO to properly deal with +# are not case sensitive the option should be set to NO to properly deal with # output files written for symbols that only differ in casing, such as for two # classes, one named CLASS and the other named Class, and to also support # references to files without having to specify the exact matching casing. On # Windows (including Cygwin) and MacOS, users should typically set this option # to NO, whereas on Linux or other Unix flavors it should typically be set to # YES. -# The default value is: system dependent. +# Possible values are: SYSTEM, NO and YES. +# The default value is: SYSTEM. CASE_SENSE_NAMES = NO @@ -836,11 +879,26 @@ WARN_IF_INCOMPLETE_DOC = YES WARN_NO_PARAMDOC = YES +# If WARN_IF_UNDOC_ENUM_VAL option is set to YES, doxygen will warn about +# undocumented enumeration values. If set to NO, doxygen will accept +# undocumented enumeration values. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: NO. + +WARN_IF_UNDOC_ENUM_VAL = NO + # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when # a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS # then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but # at the end of the doxygen process doxygen will return with a non-zero status. -# Possible values are: NO, YES and FAIL_ON_WARNINGS. +# If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS_PRINT then doxygen behaves +# like FAIL_ON_WARNINGS but in case no WARN_LOGFILE is defined doxygen will not +# write the warning messages in between other messages but write them at the end +# of a run, in case a WARN_LOGFILE is defined the warning messages will be +# besides being in the defined file also be shown at the end of a run, unless +# the WARN_LOGFILE is defined as - i.e. standard output (stdout) in that case +# the behavior will remain as with the setting FAIL_ON_WARNINGS. +# Possible values are: NO, YES, FAIL_ON_WARNINGS and FAIL_ON_WARNINGS_PRINT. 
# The default value is: NO. WARN_AS_ERROR = YES @@ -851,13 +909,27 @@ WARN_AS_ERROR = YES # and the warning text. Optionally the format may contain $version, which will # be replaced by the version of the file (if it could be obtained via # FILE_VERSION_FILTER) +# See also: WARN_LINE_FORMAT # The default value is: $file:$line: $text. WARN_FORMAT = "$file:$line: $text" +# In the $text part of the WARN_FORMAT command it is possible that a reference +# to a more specific place is given. To make it easier to jump to this place +# (outside of doxygen) the user can define a custom "cut" / "paste" string. +# Example: +# WARN_LINE_FORMAT = "'vi $file +$line'" +# See also: WARN_FORMAT +# The default value is: at line $line of file $file. + +WARN_LINE_FORMAT = "at line $line of file $file" + # The WARN_LOGFILE tag can be used to specify a file to which warning and error # messages should be written. If left blank the output is written to standard -# error (stderr). +# error (stderr). In case the file specified cannot be opened for writing the +# warning and error messages are written to standard error. When as file - is +# specified the warning and error messages are written to standard output +# (stdout). WARN_LOGFILE = @@ -881,10 +953,21 @@ INPUT = ../../include/onnxruntime/core/session/onnxruntime_c_ap # libiconv (or the iconv built into libc) for the transcoding. See the libiconv # documentation (see: # https://www.gnu.org/software/libiconv/) for the list of possible encodings. +# See also: INPUT_FILE_ENCODING # The default value is: UTF-8. INPUT_ENCODING = UTF-8 +# This tag can be used to specify the character encoding of the source files +# that doxygen parses The INPUT_FILE_ENCODING tag can be used to specify +# character encoding on a per file pattern basis. Doxygen will compare the file +# name with each pattern and apply the encoding instead of the default +# INPUT_ENCODING) if there is a match. The character encodings are a list of the +# form: pattern=encoding (like *.php=ISO-8859-1). See cfg_input_encoding +# "INPUT_ENCODING" for further information on supported encodings. + +INPUT_FILE_ENCODING = + # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. @@ -896,18 +979,21 @@ INPUT_ENCODING = UTF-8 # Note the list of default checked file patterns might differ from the list of # default file extension mappings. # -# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, -# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, -# *.hh, *.hxx, *.hpp, *.h++, *.l, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, -# *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C -# comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, -# *.vhdl, *.ucf, *.qsf and *.ice. +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cxxm, +# *.cpp, *.cppm, *.c++, *.c++m, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, +# *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp, *.h++, *.ixx, *.l, *.cs, *.d, *.php, +# *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be +# provided as doxygen C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, +# *.f18, *.f, *.for, *.vhd, *.vhdl, *.ucf, *.qsf and *.ice. 
FILE_PATTERNS = *.c \ *.cc \ *.cxx \ + *.cxxm \ *.cpp \ + *.cppm \ *.c++ \ + *.c++m \ *.java \ *.ii \ *.ixx \ @@ -922,6 +1008,8 @@ FILE_PATTERNS = *.c \ *.hxx \ *.hpp \ *.h++ \ + *.ixx \ + *.l \ *.cs \ *.d \ *.php \ @@ -984,10 +1072,7 @@ EXCLUDE_PATTERNS = # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, -# AClass::ANamespace, ANamespace::*Test -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories use the pattern */test/* +# ANamespace::AClass, ANamespace::*Test EXCLUDE_SYMBOLS = @@ -1032,6 +1117,11 @@ IMAGE_PATH = # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. # +# Note that doxygen will use the data processed and written to standard output +# for further processing, therefore nothing else, like debug statements or used +# commands (so in case of a Windows batch file always use @echo OFF), should be +# written to standard output. +# # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen. @@ -1073,6 +1163,15 @@ FILTER_SOURCE_PATTERNS = USE_MDFILE_AS_MAINPAGE = +# The Fortran standard specifies that for fixed formatted Fortran code all +# characters from position 72 are to be considered as comment. A common +# extension is to allow longer lines before the automatic comment starts. The +# setting FORTRAN_COMMENT_AFTER will also make it possible that longer lines can +# be processed before the automatic comment starts. +# Minimum value: 7, maximum value: 10000, default value: 72. + +FORTRAN_COMMENT_AFTER = 72 + #--------------------------------------------------------------------------- # Configuration options related to source browsing #--------------------------------------------------------------------------- @@ -1210,10 +1309,11 @@ CLANG_DATABASE_PATH = ALPHABETICAL_INDEX = YES -# In case all classes in a project start with a common prefix, all classes will -# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag -# can be used to specify a prefix (or a list of prefixes) that should be ignored -# while generating the index headers. +# The IGNORE_PREFIX tag can be used to specify a prefix (or a list of prefixes) +# that should be ignored while generating the index headers. The IGNORE_PREFIX +# tag works for classes, function and member names. The entity will be placed in +# the alphabetical list under the first letter of the entity name that remains +# after removing the prefix. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. IGNORE_PREFIX = @@ -1292,7 +1392,12 @@ HTML_STYLESHEET = # Doxygen will copy the style sheet files to the output directory. # Note: The order of the extra style sheet files is of importance (e.g. the last # style sheet in the list overrules the setting of the previous ones in the -# list). For an example see the documentation. +# list). +# Note: Since the styling of scrollbars can currently not be overruled in +# Webkit/Chromium, the styling will be left out of the default doxygen.css if +# one or more extra stylesheets have been specified. So if scrollbar +# customization is desired it has to be added explicitly. For an example see the +# documentation. 
# This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_STYLESHEET = @@ -1307,6 +1412,19 @@ HTML_EXTRA_STYLESHEET = HTML_EXTRA_FILES = +# The HTML_COLORSTYLE tag can be used to specify if the generated HTML output +# should be rendered with a dark or light theme. +# Possible values are: LIGHT always generate light mode output, DARK always +# generate dark mode output, AUTO_LIGHT automatically set the mode according to +# the user preference, use light mode if no preference is set (the default), +# AUTO_DARK automatically set the mode according to the user preference, use +# dark mode if no preference is set and TOGGLE allow to user to switch between +# light and dark mode via a button. +# The default value is: AUTO_LIGHT. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE = AUTO_LIGHT + # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to # this color. Hue is specified as an angle on a color-wheel, see @@ -1337,15 +1455,6 @@ HTML_COLORSTYLE_SAT = 100 HTML_COLORSTYLE_GAMMA = 80 -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting this -# to YES can help to show when doxygen was last run and thus if the -# documentation is up to date. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_TIMESTAMP = NO - # If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML # documentation will contain a main index with vertical navigation menus that # are dynamically created via JavaScript. If disabled, the navigation index will @@ -1365,6 +1474,13 @@ HTML_DYNAMIC_MENUS = YES HTML_DYNAMIC_SECTIONS = NO +# If the HTML_CODE_FOLDING tag is set to YES then classes and functions can be +# dynamically folded and expanded in the generated HTML source code. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_CODE_FOLDING = YES + # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries # shown in the various tree structured indices initially; the user can expand # and collapse entries dynamically later on. Doxygen will expand the tree to @@ -1401,6 +1517,13 @@ GENERATE_DOCSET = NO DOCSET_FEEDNAME = "Doxygen generated docs" +# This tag determines the URL of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDURL = + # This tag specifies a string that should uniquely identify the documentation # set bundle. This should be a reverse domain-name style string, e.g. # com.mycompany.MyDocSet. Doxygen will append .docset to the name. @@ -1488,6 +1611,16 @@ BINARY_TOC = NO TOC_EXPAND = NO +# The SITEMAP_URL tag is used to specify the full URL of the place where the +# generated documentation will be placed on the server by the user during the +# deployment of the documentation. The generated sitemap is called sitemap.xml +# and placed on the directory specified by HTML_OUTPUT. In case no SITEMAP_URL +# is specified no sitemap is generated. For information about the sitemap +# protocol see https://www.sitemaps.org +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +SITEMAP_URL = + # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that # can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help @@ -1605,7 +1738,7 @@ GENERATE_TREEVIEW = YES # area (value NO) or if it should extend to the full height of the window (value # YES). Setting this to YES gives a layout similar to # https://docs.readthedocs.io with more room for contents, but less room for the -# project logo, title, and description. If either GENERATOR_TREEVIEW or +# project logo, title, and description. If either GENERATE_TREEVIEW or # DISABLE_INDEX is set to NO, this option has no effect. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. @@ -1636,6 +1769,13 @@ TREEVIEW_WIDTH = 250 EXT_LINKS_IN_WINDOW = NO +# If the OBFUSCATE_EMAILS tag is set to YES, doxygen will obfuscate email +# addresses. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +OBFUSCATE_EMAILS = YES + # If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg # tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see # https://inkscape.org) to generate formulas as SVG images instead of PNGs for @@ -1969,9 +2109,16 @@ PDF_HYPERLINKS = YES USE_PDFLATEX = YES -# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode -# command to the generated LaTeX files. This will instruct LaTeX to keep running -# if errors occur, instead of asking the user for help. +# The LATEX_BATCHMODE tag signals the behavior of LaTeX in case of an error. +# Possible values are: NO same as ERROR_STOP, YES same as BATCH, BATCH In batch +# mode nothing is printed on the terminal, errors are scrolled as if is +# hit at every error; missing files that TeX tries to input or request from +# keyboard input (\read on a not open input stream) cause the job to abort, +# NON_STOP In nonstop mode the diagnostic message will appear on the terminal, +# but there is no possibility of user interaction just like in batch mode, +# SCROLL In scroll mode, TeX will stop only for missing files to input or if +# keyboard input is necessary and ERROR_STOP In errorstop mode, TeX will stop at +# each error, asking for user intervention. # The default value is: NO. # This tag requires that the tag GENERATE_LATEX is set to YES. @@ -1992,14 +2139,6 @@ LATEX_HIDE_INDICES = NO LATEX_BIB_STYLE = plain -# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated -# page will contain the date and time when the page was generated. Setting this -# to NO can help when comparing the output of multiple runs. -# The default value is: NO. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -LATEX_TIMESTAMP = NO - # The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute) # path from which the emoji images will be read. If a relative path is entered, # it will be relative to the LATEX_OUTPUT directory. If left blank the @@ -2165,7 +2304,7 @@ DOCBOOK_OUTPUT = docbook #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an -# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures +# AutoGen Definitions (see https://autogen.sourceforge.net/) file that captures # the structure of the code including all documentation. Note that this feature # is still experimental and incomplete at the moment. 
# The default value is: NO. @@ -2176,6 +2315,28 @@ GENERATE_AUTOGEN_DEF = NO # Configuration options related to Sqlite3 output #--------------------------------------------------------------------------- +# If the GENERATE_SQLITE3 tag is set to YES doxygen will generate a Sqlite3 +# database with symbols found by doxygen stored in tables. +# The default value is: NO. + +GENERATE_SQLITE3 = NO + +# The SQLITE3_OUTPUT tag is used to specify where the Sqlite3 database will be +# put. If a relative path is entered the value of OUTPUT_DIRECTORY will be put +# in front of it. +# The default directory is: sqlite3. +# This tag requires that the tag GENERATE_SQLITE3 is set to YES. + +SQLITE3_OUTPUT = sqlite3 + +# The SQLITE3_OVERWRITE_DB tag is set to YES, the existing doxygen_sqlite3.db +# database file will be recreated with each doxygen run. If set to NO, doxygen +# will warn if an a database file is already found and not modify it. +# The default value is: YES. +# This tag requires that the tag GENERATE_SQLITE3 is set to YES. + +SQLITE3_RECREATE_DB = YES + #--------------------------------------------------------------------------- # Configuration options related to the Perl module output #--------------------------------------------------------------------------- @@ -2250,7 +2411,8 @@ SEARCH_INCLUDES = YES # The INCLUDE_PATH tag can be used to specify one or more directories that # contain include files that are not input files but should be processed by the -# preprocessor. +# preprocessor. Note that the INCLUDE_PATH is not recursive, so the setting of +# RECURSIVE has no effect here. # This tag requires that the tag SEARCH_INCLUDES is set to YES. # onnxruntime-training and onnxruntime core headers are in different directories. @@ -2324,15 +2486,15 @@ TAGFILES = GENERATE_TAGFILE = -# If the ALLEXTERNALS tag is set to YES, all external class will be listed in -# the class index. If set to NO, only the inherited external classes will be -# listed. +# If the ALLEXTERNALS tag is set to YES, all external classes and namespaces +# will be listed in the class and namespace index. If set to NO, only the +# inherited external classes will be listed. # The default value is: NO. ALLEXTERNALS = NO # If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed -# in the modules index. If set to NO, only the current project's groups will be +# in the topic index. If set to NO, only the current project's groups will be # listed. # The default value is: YES. @@ -2346,16 +2508,9 @@ EXTERNAL_GROUPS = YES EXTERNAL_PAGES = YES #--------------------------------------------------------------------------- -# Configuration options related to the dot tool +# Configuration options related to diagram generator tools #--------------------------------------------------------------------------- -# You can include diagrams made with dia in doxygen documentation. Doxygen will -# then run dia to produce the diagram and insert it in the documentation. The -# DIA_PATH tag allows you to specify the directory where the dia binary resides. -# If left empty dia is assumed to be found in the default search path. - -DIA_PATH = - # If set to YES the inheritance and collaboration graphs will hide inheritance # and usage relations if the target is undocumented or is not a class. # The default value is: YES. @@ -2364,7 +2519,7 @@ HIDE_UNDOC_RELATIONS = YES # If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is # available from the path. 
This tool is part of Graphviz (see: -# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent +# https://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent # Bell Labs. The other options in this section have no effect if this option is # set to NO # The default value is: NO. @@ -2381,32 +2536,73 @@ HAVE_DOT = NO DOT_NUM_THREADS = 0 -# By default doxygen will tell dot to use the default font as specified with -# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set -# the path where dot can find it using this tag. +# DOT_COMMON_ATTR is common attributes for nodes, edges and labels of +# subgraphs. When you want a differently looking font in the dot files that +# doxygen generates you can specify fontname, fontcolor and fontsize attributes. +# For details please see Node, +# Edge and Graph Attributes specification You need to make sure dot is able +# to find the font, which can be done by putting it in a standard location or by +# setting the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the +# directory containing the font. Default graphviz fontsize is 14. +# The default value is: fontname=Helvetica,fontsize=10. +# This tag requires that the tag HAVE_DOT is set to YES. + +DOT_COMMON_ATTR = "fontname=Helvetica,fontsize=10" + +# DOT_EDGE_ATTR is concatenated with DOT_COMMON_ATTR. For elegant style you can +# add 'arrowhead=open, arrowtail=open, arrowsize=0.5'. Complete documentation about +# arrows shapes. +# The default value is: labelfontname=Helvetica,labelfontsize=10. +# This tag requires that the tag HAVE_DOT is set to YES. + +DOT_EDGE_ATTR = "labelfontname=Helvetica,labelfontsize=10" + +# DOT_NODE_ATTR is concatenated with DOT_COMMON_ATTR. For view without boxes +# around nodes set 'shape=plain' or 'shape=plaintext' Shapes specification +# The default value is: shape=box,height=0.2,width=0.4. +# This tag requires that the tag HAVE_DOT is set to YES. + +DOT_NODE_ATTR = "shape=box,height=0.2,width=0.4" + +# You can set the path where dot can find font specified with fontname in +# DOT_COMMON_ATTR and others dot attributes. # This tag requires that the tag HAVE_DOT is set to YES. DOT_FONTPATH = -# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for -# each documented class showing the direct and indirect inheritance relations. -# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO. +# If the CLASS_GRAPH tag is set to YES or GRAPH or BUILTIN then doxygen will +# generate a graph for each documented class showing the direct and indirect +# inheritance relations. In case the CLASS_GRAPH tag is set to YES or GRAPH and +# HAVE_DOT is enabled as well, then dot will be used to draw the graph. In case +# the CLASS_GRAPH tag is set to YES and HAVE_DOT is disabled or if the +# CLASS_GRAPH tag is set to BUILTIN, then the built-in generator will be used. +# If the CLASS_GRAPH tag is set to TEXT the direct and indirect inheritance +# relations will be shown as texts / links. +# Possible values are: NO, YES, TEXT, GRAPH and BUILTIN. # The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. CLASS_GRAPH = YES # If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a # graph for each documented class showing the direct and indirect implementation # dependencies (inheritance, containment, and class references variables) of the -# class with other documented classes. +# class with other documented classes. 
Explicit enabling a collaboration graph, +# when COLLABORATION_GRAPH is set to NO, can be accomplished by means of the +# command \collaborationgraph. Disabling a collaboration graph can be +# accomplished by means of the command \hidecollaborationgraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. COLLABORATION_GRAPH = YES # If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for -# groups, showing the direct groups dependencies. +# groups, showing the direct groups dependencies. Explicit enabling a group +# dependency graph, when GROUP_GRAPHS is set to NO, can be accomplished by means +# of the command \groupgraph. Disabling a directory graph can be accomplished by +# means of the command \hidegroupgraph. See also the chapter Grouping in the +# manual. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. @@ -2466,7 +2662,9 @@ TEMPLATE_RELATIONS = NO # If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to # YES then doxygen will generate a graph for each documented file showing the # direct and indirect include dependencies of the file with other documented -# files. +# files. Explicit enabling an include graph, when INCLUDE_GRAPH is is set to NO, +# can be accomplished by means of the command \includegraph. Disabling an +# include graph can be accomplished by means of the command \hideincludegraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. @@ -2475,7 +2673,10 @@ INCLUDE_GRAPH = YES # If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are # set to YES then doxygen will generate a graph for each documented file showing # the direct and indirect include dependencies of the file with other documented -# files. +# files. Explicit enabling an included by graph, when INCLUDED_BY_GRAPH is set +# to NO, can be accomplished by means of the command \includedbygraph. Disabling +# an included by graph can be accomplished by means of the command +# \hideincludedbygraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. @@ -2515,16 +2716,26 @@ GRAPHICAL_HIERARCHY = YES # If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the # dependencies a directory has on other directories in a graphical way. The # dependency relations are determined by the #include relations between the -# files in the directories. +# files in the directories. Explicit enabling a directory graph, when +# DIRECTORY_GRAPH is set to NO, can be accomplished by means of the command +# \directorygraph. Disabling a directory graph can be accomplished by means of +# the command \hidedirectorygraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. DIRECTORY_GRAPH = YES +# The DIR_GRAPH_MAX_DEPTH tag can be used to limit the maximum number of levels +# of child directories generated in directory dependency graphs by dot. +# Minimum value: 1, maximum value: 25, default value: 1. +# This tag requires that the tag DIRECTORY_GRAPH is set to YES. + +DIR_GRAPH_MAX_DEPTH = 1 + # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images # generated by dot. For an explanation of the image formats see the section # output formats in the documentation of the dot tool (Graphviz (see: -# http://www.graphviz.org/)). +# https://www.graphviz.org/)). 
# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order # to make the SVG files visible in IE 9+ (other browsers do not have this # requirement). @@ -2561,11 +2772,12 @@ DOT_PATH = DOTFILE_DIRS = -# The MSCFILE_DIRS tag can be used to specify one or more directories that -# contain msc files that are included in the documentation (see the \mscfile -# command). +# You can include diagrams made with dia in doxygen documentation. Doxygen will +# then run dia to produce the diagram and insert it in the documentation. The +# DIA_PATH tag allows you to specify the directory where the dia binary resides. +# If left empty dia is assumed to be found in the default search path. -MSCFILE_DIRS = +DIA_PATH = # The DIAFILE_DIRS tag can be used to specify one or more directories that # contain dia files that are included in the documentation (see the \diafile @@ -2574,10 +2786,10 @@ MSCFILE_DIRS = DIAFILE_DIRS = # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the -# path where java can find the plantuml.jar file. If left blank, it is assumed -# PlantUML is not used or called during a preprocessing step. Doxygen will -# generate a warning when it encounters a \startuml command in this case and -# will not generate output for the diagram. +# path where java can find the plantuml.jar file or to the filename of jar file +# to be used. If left blank, it is assumed PlantUML is not used or called during +# a preprocessing step. Doxygen will generate a warning when it encounters a +# \startuml command in this case and will not generate output for the diagram. PLANTUML_JAR_PATH = @@ -2627,6 +2839,8 @@ DOT_MULTI_TARGETS = NO # If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page # explaining the meaning of the various boxes and arrows in the dot generated # graphs. +# Note: This tag requires that UML_LOOK isn't set, i.e. the doxygen internal +# graphical representation for inheritance and collaboration diagrams is used. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. @@ -2640,3 +2854,19 @@ GENERATE_LEGEND = YES # The default value is: YES. DOT_CLEANUP = YES + +# You can define message sequence charts within doxygen comments using the \msc +# command. If the MSCGEN_TOOL tag is left empty (the default), then doxygen will +# use a built-in version of mscgen tool to produce the charts. Alternatively, +# the MSCGEN_TOOL tag can also specify the name an external tool. For instance, +# specifying prog as the value, doxygen will call the tool as prog -T +# -o . The external tool should support +# output file formats "png", "eps", "svg", and "ismap". + +MSCGEN_TOOL = + +# The MSCFILE_DIRS tag can be used to specify one or more directories that +# contain msc files that are included in the documentation (see the \mscfile +# command). + +MSCFILE_DIRS = From b52127d22d008254f20de1956e698b8a6f8712d6 Mon Sep 17 00:00:00 2001 From: rui-ren Date: Tue, 12 Sep 2023 22:32:20 -0700 Subject: [PATCH 07/34] update acpt image for the training ci nightly (#17521) ### Description The name of nightly ACPT image has been updated to `ptebic.azurecr.io/internal/aifx/acpt/nightly-ubuntu-cuda-torch-dev` As the previous image alias had `cu118`, `torch210dev` or `py38`, any version update will break the training nightly pipeline ### Motivation and Context Using constant image alias to avoid pipeline failure. 
--- .../orttraining-linux-nightly-ortmodule-test-pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-nightly-ortmodule-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-nightly-ortmodule-test-pipeline.yml index a41ca5f02467d..7824bf2203efe 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-nightly-ortmodule-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-nightly-ortmodule-test-pipeline.yml @@ -23,7 +23,7 @@ jobs: --rm \ --volume $(Build.SourcesDirectory)/orttraining/orttraining/test/python:/onnxruntime_src \ --volume $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_nightly:/requirements_torch_nightly \ - ptebic.azurecr.io/internal/azureml/aifx/nightly-ubuntu2004-cu118-py38-torch210dev \ + ptebic.azurecr.io/internal/aifx/acpt/nightly-ubuntu-cuda-torch-dev \ bash -c "python3 -m pip install -r /requirements_torch_nightly/requirements.txt && python3 -m pytest -sv /onnxruntime_src/orttraining_test_ortmodule_api.py" displayName: 'Run ORTModule Tests' condition: succeededOrFailed() From 24a3c740c06f3c3b2d97f2839ece55bbd8ea0dd9 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Tue, 12 Sep 2023 22:39:31 -0700 Subject: [PATCH 08/34] Revert "[ROCm][MIGraphX] for googletest dep, set OVERRIDE_FIND_PACKAGE (#16715)" (#17523) This reverts commit bb136f86c8a1d0bcbdc2a77cb16f1c26c9ebd817, then re-implement it in a different way. I reverted the original change, then added a version constraint to the find_package args. If you still found it picks up wrong gtest version after this change, you may disable `find_package` by setting 'FETCHCONTENT_TRY_FIND_PACKAGE_MODE' to NEVER. For example, the latest gtest version is 1.14.0. If at a later time Google releases a new version of gtest and that one is incompatible with the ONNX Runtime source code you get today and your dev environment already installed the new version and you do not want to create a new clean build environment that is without the package, you can add `--cmake_extra_defines FETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER` to your build command to solve the problem. 
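For illustration, here is a minimal standalone CMake sketch of the mechanism (this is not the ONNX Runtime file; the project name, URL, and hash are placeholders, and it assumes CMake >= 3.24, which introduced `FIND_PACKAGE_ARGS` and `FETCHCONTENT_TRY_FIND_PACKAGE_MODE`):

```cmake
# Hypothetical standalone example; the URL and SHA1 are placeholders, not the
# real googletest archive referenced by onnxruntime_external_deps.cmake.
cmake_minimum_required(VERSION 3.24)
project(gtest_fetch_demo LANGUAGES CXX)

include(FetchContent)

FetchContent_Declare(
  googletest
  URL      https://example.com/googletest-1.14.0.zip                      # placeholder
  URL_HASH SHA1=0000000000000000000000000000000000000000                  # placeholder
  # Everything after FIND_PACKAGE_ARGS is forwarded to find_package(), so under
  # the default FETCHCONTENT_TRY_FIND_PACKAGE_MODE (OPT_IN) the declaration below
  # makes FetchContent_MakeAvailable() first look for an installed GTest package
  # whose version is >= 1.14.0 and < 2.0.0, and only download the sources if no
  # compatible installation is found.
  FIND_PACKAGE_ARGS 1.14.0...<2.0.0 NAMES GTest
)

# Configuring with -DFETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER (or uncommenting
# the next line) skips find_package() entirely and always builds from the
# declared sources:
# set(FETCHCONTENT_TRY_FIND_PACKAGE_MODE NEVER)

FetchContent_MakeAvailable(googletest)
```

The version range keeps an incompatible (too old, or future major version) installed googletest from being picked up, while `NEVER` remains the escape hatch for building against the fetched sources only.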
--- cmake/external/onnxruntime_external_deps.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 2a100ac161b97..e1671bcf43ed9 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -43,8 +43,8 @@ if (onnxruntime_BUILD_UNIT_TESTS) FetchContent_Declare( googletest URL ${DEP_URL_googletest} + FIND_PACKAGE_ARGS 1.14.0...<2.0.0 NAMES GTest URL_HASH SHA1=${DEP_SHA1_googletest} - OVERRIDE_FIND_PACKAGE ) endif() From 41584b28278a25f0e6a269a47264bccad9888f87 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 12 Sep 2023 23:52:08 -0700 Subject: [PATCH 09/34] [js/web] ensure ORT initialization to run only once (#17529) ### Description ensure ORT initialization to run only once --- js/web/lib/wasm/session-handler.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/js/web/lib/wasm/session-handler.ts b/js/web/lib/wasm/session-handler.ts index d35f295592685..d8c5ae7886fe4 100644 --- a/js/web/lib/wasm/session-handler.ts +++ b/js/web/lib/wasm/session-handler.ts @@ -9,6 +9,7 @@ import {SerializableModeldata} from './proxy-messages'; import {createSession, createSessionAllocate, createSessionFinalize, endProfiling, initializeRuntime, releaseSession, run} from './proxy-wrapper'; let runtimeInitialized: boolean; +let runtimeInitializationPromise: Promise|undefined; export class OnnxruntimeWebAssemblySessionHandler implements SessionHandler { private sessionId: number; @@ -29,7 +30,11 @@ export class OnnxruntimeWebAssemblySessionHandler implements SessionHandler { async loadModel(pathOrBuffer: string|Uint8Array, options?: InferenceSession.SessionOptions): Promise { if (!runtimeInitialized) { - await initializeRuntime(env); + if (!runtimeInitializationPromise) { + runtimeInitializationPromise = initializeRuntime(env); + } + await runtimeInitializationPromise; + runtimeInitializationPromise = undefined; runtimeInitialized = true; } From ec94b07f0a14fc0a805e767803d483059cbd58ff Mon Sep 17 00:00:00 2001 From: xhcao Date: Wed, 13 Sep 2023 15:05:00 +0800 Subject: [PATCH 10/34] [JS/WebGPU] support Concat.int32 operator (#17003) ### Description ### Motivation and Context --- js/web/test/data/ops/concat_int32.jsonc | 406 ++++++++++++++++++ js/web/test/suite-test-list.jsonc | 27 +- .../core/providers/js/operators/concat.cc | 12 +- 3 files changed, 428 insertions(+), 17 deletions(-) create mode 100644 js/web/test/data/ops/concat_int32.jsonc diff --git a/js/web/test/data/ops/concat_int32.jsonc b/js/web/test/data/ops/concat_int32.jsonc new file mode 100644 index 0000000000000..6e2ce18c6f7c5 --- /dev/null +++ b/js/web/test/data/ops/concat_int32.jsonc @@ -0,0 +1,406 @@ +[ + { + "name": "Concat 2D axis=0", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 0, "type": "int" }], + "cases": [ + { + "name": "[4,4]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + "dims": [4, 4], + "type": "int32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + "dims": [4, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16 + ], + "dims": [8, 4], + "type": "int32" + } + ] + }, + { + "name": "[2,4]", + "inputs": [ + { + "data": [1, 2, 5, 6, 3, 4, 7, 8], + "dims": [2, 4], + "type": "int32" + }, + { 
+ "data": [1, 2, 3, 4, 5, 6, 7, 8], + "dims": [2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [1, 2, 5, 6, 3, 4, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8], + "dims": [4, 4], + "type": "int32" + } + ] + }, + { + "name": "[2,3]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [2, 3], + "type": "int32" + }, + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [2, 3], + "type": "int32" + } + ], + "outputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6], + "dims": [4, 3], + "type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 2D axis=1", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 1, "type": "int" }], + "cases": [ + { + "name": "[4,4]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + "dims": [4, 4], + "type": "int32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + "dims": [4, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 7, 8, 5, 6, 7, 8, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16, 13, 14, 15, + 16 + ], + "dims": [4, 8], + "type": "int32" + } + ] + }, + { + "name": "[2,4]", + "inputs": [ + { + "data": [1, 2, 5, 6, 3, 4, 7, 8], + "dims": [2, 4], + "type": "int32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8], + "dims": [2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [1, 2, 5, 6, 1, 2, 3, 4, 3, 4, 7, 8, 5, 6, 7, 8], + "dims": [2, 8], + "type": "int32" + } + ] + }, + { + "name": "[2,3]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [2, 3], + "type": "int32" + }, + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [2, 3], + "type": "int32" + } + ], + "outputs": [ + { + "data": [1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6], + "dims": [2, 6], + "type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 3D axis=0", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 0, "type": "int" }], + "cases": [ + { + "name": "[2,2,4]", + "inputs": [ + { + "data": [1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16], + "dims": [2, 2, 4], + "type": "int32" + }, + { + "data": [1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16], + "dims": [2, 2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, + 16 + ], + "dims": [4, 2, 4], + "type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 3D axis=1", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 1, "type": "int" }], + "cases": [ + { + "name": "[2,2,4]", + "inputs": [ + { + "data": [1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16], + "dims": [2, 2, 4], + "type": "int32" + }, + { + "data": [1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16], + "dims": [2, 2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 9, 10, 13, 14, 11, 12, 15, + 16 + ], + "dims": [2, 4, 4], + "type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 3D axis=2", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 2, "type": "int" }], + "cases": [ + { + "name": "[2,2,4]", + "inputs": [ + { + "data": [1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16], + "dims": [2, 2, 4], + "type": "int32" + }, + { + "data": [1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16], + "dims": [2, 2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 5, 6, 1, 2, 5, 6, 3, 4, 7, 8, 3, 4, 7, 8, 9, 10, 13, 14, 9, 10, 13, 14, 11, 12, 15, 16, 11, 12, 15, + 16 
+ ], + "dims": [2, 2, 8], + "type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 4D axis=0", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 0, "type": "int" }], + "cases": [ + { + "name": "[2,2,2,4]", + "inputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + }, + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32, 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, + 29, 30, 27, 28, 31, 32 + ], + "dims": [4, 2, 2, 4], + "type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 4D axis=1", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 1, "type": "int" }], + "cases": [ + { + "name": "[2,2,2,4]", + "inputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + }, + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, + 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, 28, 31, 32, 17, 18, 21, 22, 19, 20, 23, 24, 25, + 26, 29, 30, 27, 28, 31, 32 + ], + "dims": [2, 4, 2, 4], + "type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 4D axis=2", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 2, "type": "int" }], + "cases": [ + { + "name": "[2,2,2,4]", + "inputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + }, + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 9, 10, 13, 14, 11, 12, 15, + 16, 17, 18, 21, 22, 19, 20, 23, 24, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, 28, 31, 32, 25, + 26, 29, 30, 27, 28, 31, 32 + ], + "dims": [2, 2, 4, 4], + "type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 4D axis=3", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 3, "type": "int" }], + "cases": [ + { + "name": "[2,2,2,4]", + "inputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + }, + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 5, 6, 1, 2, 5, 6, 3, 4, 7, 8, 3, 4, 7, 8, 9, 10, 13, 14, 9, 10, 13, 14, 11, 12, 15, 16, 11, 12, 15, + 16, 17, 18, 21, 22, 17, 18, 21, 22, 19, 20, 23, 24, 19, 20, 23, 24, 25, 26, 29, 
30, 25, 26, 29, 30, 27, + 28, 31, 32, 27, 28, 31, 32 + ], + "dims": [2, 2, 2, 8], + "type": "int32" + } + ] + } + ] + } +] diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index f53da708b8f6f..f4249b24101e5 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -432,18 +432,18 @@ // // "test_compress_1", // // "test_compress_default_axis", // // "test_compress_negative_axis", - // "test_concat_1d_axis_0", - // "test_concat_1d_axis_negative_1", - // "test_concat_2d_axis_0", - // "test_concat_2d_axis_1", - // "test_concat_2d_axis_negative_1", - // "test_concat_2d_axis_negative_2", - // "test_concat_3d_axis_0", - // "test_concat_3d_axis_1", - // "test_concat_3d_axis_2", - // "test_concat_3d_axis_negative_1", - // "test_concat_3d_axis_negative_2", - // "test_concat_3d_axis_negative_3", + "test_concat_1d_axis_0", + "test_concat_1d_axis_negative_1", + "test_concat_2d_axis_0", + "test_concat_2d_axis_1", + "test_concat_2d_axis_negative_1", + "test_concat_2d_axis_negative_2", + "test_concat_3d_axis_0", + "test_concat_3d_axis_1", + "test_concat_3d_axis_2", + "test_concat_3d_axis_negative_1", + "test_concat_3d_axis_negative_2", + "test_concat_3d_axis_negative_3", "test_conv_with_autopad_same", "test_conv_with_strides_and_asymmetric_padding", "test_conv_with_strides_no_padding", @@ -1330,7 +1330,8 @@ //"and.jsonc", "asin.jsonc", "ceil.jsonc", - //"concat.jsonc", + "concat.jsonc", + "concat_int32.jsonc", "cast.jsonc", "conv.jsonc", "cos.jsonc", diff --git a/onnxruntime/core/providers/js/operators/concat.cc b/onnxruntime/core/providers/js/operators/concat.cc index 7d50d78c82851..3a6a7e1cafd7a 100644 --- a/onnxruntime/core/providers/js/operators/concat.cc +++ b/onnxruntime/core/providers/js/operators/concat.cc @@ -12,7 +12,8 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( 1, 3, kJsExecutionProvider, (*KernelDefBuilder::Create()) - .TypeConstraint("T", DataTypeImpl::GetTensorType()), + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), Concat); ONNX_OPERATOR_VERSIONED_KERNEL_EX( @@ -21,7 +22,8 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( 4, 10, kJsExecutionProvider, (*KernelDefBuilder::Create()) - .TypeConstraint("T", DataTypeImpl::GetTensorType()), + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), Concat); ONNX_OPERATOR_VERSIONED_KERNEL_EX( @@ -30,7 +32,8 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( 11, 12, kJsExecutionProvider, (*KernelDefBuilder::Create()) - .TypeConstraint("T", DataTypeImpl::GetTensorType()), + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), Concat); ONNX_OPERATOR_KERNEL_EX( @@ -39,7 +42,8 @@ ONNX_OPERATOR_KERNEL_EX( 13, kJsExecutionProvider, (*KernelDefBuilder::Create()) - .TypeConstraint("T", DataTypeImpl::GetTensorType()), + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), Concat); } // namespace js From cdf3e9dba9718b4b461540894905e01357096b7d Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 13 Sep 2023 00:07:16 -0700 Subject: [PATCH 11/34] [js] update prepack script to use exact version (#17484) ### Description update prepack script to use exact version. the prepack script for onnxruntime-node, onnxruntime-web and onnxruntime-react-native is used to update their referencing version of dependency "onnxruntime-common". Previously "~" (tilde symbol) is used. 
This may cause NPM choose an older version (if the old version matches the version requirement and was previously installed already so hit the cache). see also https://semver.npmjs.com/. [This build](https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId=1134671&view=results) is caused by this issue. --- js/node/script/prepack.ts | 2 +- js/react_native/scripts/prepack.ts | 2 +- js/web/script/prepack.ts | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/js/node/script/prepack.ts b/js/node/script/prepack.ts index be86c5687bec0..4c5941d8dae12 100644 --- a/js/node/script/prepack.ts +++ b/js/node/script/prepack.ts @@ -11,7 +11,7 @@ function updatePackageJson() { const packageCommon = fs.readJSONSync(commonPackageJsonPath); const packageSelf = fs.readJSONSync(selfPackageJsonPath); const version = packageCommon.version; - packageSelf.dependencies['onnxruntime-common'] = `~${version}`; + packageSelf.dependencies['onnxruntime-common'] = `${version}`; fs.writeJSONSync(selfPackageJsonPath, packageSelf, {spaces: 2}); console.log('=== finished updating package.json.'); } diff --git a/js/react_native/scripts/prepack.ts b/js/react_native/scripts/prepack.ts index 15ae69722108c..2e43294165a83 100644 --- a/js/react_native/scripts/prepack.ts +++ b/js/react_native/scripts/prepack.ts @@ -18,7 +18,7 @@ function updatePackageJson() { delete packageSelf.dependencies['onnxruntime-common']; } else { const version = packageCommon.version; - packageSelf.dependencies['onnxruntime-common'] = `~${version}`; + packageSelf.dependencies['onnxruntime-common'] = `${version}`; } fs.writeJSONSync(selfPackageJsonPath, packageSelf, {spaces: 2}); console.log('=== finished updating package.json.'); diff --git a/js/web/script/prepack.ts b/js/web/script/prepack.ts index be86c5687bec0..4c5941d8dae12 100644 --- a/js/web/script/prepack.ts +++ b/js/web/script/prepack.ts @@ -11,7 +11,7 @@ function updatePackageJson() { const packageCommon = fs.readJSONSync(commonPackageJsonPath); const packageSelf = fs.readJSONSync(selfPackageJsonPath); const version = packageCommon.version; - packageSelf.dependencies['onnxruntime-common'] = `~${version}`; + packageSelf.dependencies['onnxruntime-common'] = `${version}`; fs.writeJSONSync(selfPackageJsonPath, packageSelf, {spaces: 2}); console.log('=== finished updating package.json.'); } From c0a4fe777fcc1311bf1379651ca68dfde176d94d Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Wed, 13 Sep 2023 15:21:28 +0800 Subject: [PATCH 12/34] Move Linux python test into docker (#17479) ### Description supplement of #17417 ### Motivation and Context --- .../azure-pipelines/linux-ci-pipeline.yml | 103 ++++++------------ tools/scripts/python_test.sh | 28 +++++ tools/scripts/symbolic_shape_infer_test.sh | 13 +++ 3 files changed, 73 insertions(+), 71 deletions(-) create mode 100644 tools/scripts/python_test.sh create mode 100644 tools/scripts/symbolic_shape_infer_test.sh diff --git a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml index eb6b274f87d6b..21bc1c481b3e6 100644 --- a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml @@ -141,77 +141,39 @@ stages: " displayName: 'Dotnet build C# sln and Test' - - task: CmdLine@2 - displayName: 'Install python deps' - inputs: - script: | - set -e -x - python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml onnx 
-qq - cp $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt $(Build.BinariesDirectory)/requirements.txt - # Test ORT with the latest ONNX release. - sed -i "s/git+http:\/\/github\.com\/onnx\/onnx.*/onnx/" $(Build.BinariesDirectory)/requirements.txt - python3 -m pip install -r $(Build.BinariesDirectory)/requirements.txt - mkdir $(Build.BinariesDirectory)/requirements_torch_cpu/ - cp $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_cpu/requirements.txt $(Build.BinariesDirectory)/requirements_torch_cpu/requirements.txt - python3 -m pip install -r $(Build.BinariesDirectory)/requirements_torch_cpu/requirements.txt - - - task: CmdLine@2 - displayName: 'Install Release python package' - inputs: - script: | - rm -rf $(Build.BinariesDirectory)/Release/onnxruntime $(Build.BinariesDirectory)/Release/pybind11 - python3 -m pip install $(Build.BinariesDirectory)/Release/dist/*.whl - - - task: PythonScript@0 - displayName: 'Run Release unit tests' - inputs: - scriptPath: $(Build.SourcesDirectory)/tools/ci_build/build.py - workingDirectory: $(Build.BinariesDirectory)/Release - arguments: >- - --build_dir $(Build.BinariesDirectory) - --cmake_generator Ninja - --config Release - --test - --skip_submodule_sync - --build_shared_lib - --parallel - --build_wheel - --enable_onnx_tests - --enable_transformers_tool_test - --ctest_path "" - - - task: CmdLine@2 - displayName: 'Install Debug python package' - inputs: - script: | - set -e -x - rm -rf $(Build.BinariesDirectory)/Debug/onnxruntime $(Build.BinariesDirectory)/Debug/pybind11 - python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq - python3 -m pip install $(Build.BinariesDirectory)/Debug/dist/*.whl - - - task: PythonScript@0 - displayName: 'Run Debug unit tests' - inputs: - scriptPath: $(Build.SourcesDirectory)/tools/ci_build/build.py - workingDirectory: $(Build.BinariesDirectory)/Debug - arguments: >- - --build_dir $(Build.BinariesDirectory) - --cmake_generator Ninja - --config Debug - --test - --skip_submodule_sync - --build_shared_lib - --parallel - --build_wheel - --enable_onnx_tests - --enable_transformers_tool_test - --ctest_path "" + - bash: | + mkdir -p $HOME/.onnx + docker run --rm \ + --volume /data/onnx:/data/onnx:ro \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --volume /data/models:/build/models:ro \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ + -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \ + -e NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + onnxruntimecpubuild \ + /bin/bash -c " + set -ex; \ + /bin/bash /onnxruntime_src/tools/scripts/python_test.sh /onnxruntime_src /build Release && \ + /bin/bash /onnxruntime_src/tools/scripts/symbolic_shape_infer_test.sh /build + " + displayName: 'Run Release tests and symbolic shape infer test' - - task: PythonScript@0 - displayName: 'Symbolic shape infer' - inputs: - scriptPath: $(Build.BinariesDirectory)/Release/onnxruntime_test_python_symbolic_shape_infer.py - workingDirectory: $(Build.BinariesDirectory)/Release + - bash: | + mkdir -p $HOME/.onnx + docker run --rm \ + --volume /data/onnx:/data/onnx:ro \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --volume /data/models:/build/models:ro \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ + -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \ + -e 
NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + onnxruntimecpubuild \ + /bin/bash /onnxruntime_src/tools/scripts/python_test.sh /onnxruntime_src /build Debug + displayName: 'Run Debug tests' - task: PublishTestResults@2 displayName: 'Publish unit test results' @@ -221,7 +183,6 @@ stages: testRunTitle: 'Unit Test Run' condition: succeededOrFailed() - - stage: arm64_build dependsOn: [] jobs: diff --git a/tools/scripts/python_test.sh b/tools/scripts/python_test.sh new file mode 100644 index 0000000000000..bfdd4663feede --- /dev/null +++ b/tools/scripts/python_test.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +set -ex + +export src_dir=$1 +export build_dir=$2 +export config=$3 + +# it's for manylinux image +export PATH=/opt/python/cp38-cp38/bin:$PATH + +echo Install Python Deps +cp $src_dir/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt $build_dir/requirements.txt + +python3 -m pip install -r $build_dir/requirements.txt +mkdir -p $build_dir/requirements_torch_cpu/ +cp $src_dir/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_cpu/requirements.txt $build_dir/requirements_torch_cpu/requirements.txt +python3 -m pip install -r $build_dir/requirements_torch_cpu/requirements.txt +python3 -m pip list | grep onnx + +echo Install $config python package +rm -rf $build_dir/$config/onnxruntime $build_dir/$config/pybind11 +python3 -m pip install $build_dir/$config/dist/*.whl + +echo Run $config unit tests +pushd $build_dir/$config/ +python3 $src_dir/tools/ci_build/build.py --build_dir $build_dir --cmake_generator Ninja --config $config --test --skip_submodule_sync --build_shared_lib --parallel --build_wheel --enable_onnx_tests --enable_transformers_tool_test --ctest_path "" +popd diff --git a/tools/scripts/symbolic_shape_infer_test.sh b/tools/scripts/symbolic_shape_infer_test.sh new file mode 100644 index 0000000000000..d8d50c5e3fa91 --- /dev/null +++ b/tools/scripts/symbolic_shape_infer_test.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +set -ex + +export build_dir=$1 + +# it's for manylinux image +export PATH=/opt/python/cp38-cp38/bin:$PATH + +echo Run symbolic shape infer test +pushd $build_dir/Release/ +python3 /build/Release/onnxruntime_test_python_symbolic_shape_infer.py +popd From 54a092c42714caa0d236e0f67b07b0804dbb9490 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Wed, 13 Sep 2023 07:26:35 -0700 Subject: [PATCH 13/34] [DML EP] Complete python IO binding implementation (#17344) @fdwr This is the part 2 of the pybind work that was started earlier. 
This adds the following features to the python IO binding implementation: - Use a bucketized allocator in order to reduce the number of resource allocations - Implement the following functions: `ortvalue_from_numpy`, `update_inplace`, `ortvalue_from_shape_and_type` and `numpy` - Modify the `onnxruntime_test_python_iobinding` tests to also run on DML --------- Co-authored-by: Jeff Bloomfield --- .../src/BucketizedBufferAllocator.cpp | 9 - .../src/BucketizedBufferAllocator.h | 8 +- .../providers/dml/dml_provider_factory.cc | 39 +- .../dml/dml_provider_factory_creator.h | 1 + .../python/onnxruntime_pybind_mlvalue.cc | 107 +++- .../python/onnxruntime_pybind_mlvalue.h | 6 + .../python/onnxruntime_pybind_ortvalue.cc | 31 +- .../python/onnxruntime_pybind_state.cc | 4 + .../onnxruntime_test_python_iobinding.py | 455 ++++++++++-------- tools/ci_build/build.py | 7 +- 10 files changed, 413 insertions(+), 254 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp index 5dbea41901b80..c24257071eda5 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -212,15 +212,6 @@ namespace Dml ORT_THROW_HR(E_INVALIDARG); } const auto* allocInfo = static_cast(opaqueHandle); - - auto owner = allocInfo->GetOwner(); - //The owner can be null if the resource was wrapped via CreateGPUAllocationFromD3DResource - if (owner != nullptr && owner != this) - { - // This allocation doesn't belong to this allocator! - ORT_THROW_HR(E_INVALIDARG); - } - return allocInfo; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h index 4c24cb174f6ed..196fba5d7689d 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h @@ -83,16 +83,16 @@ namespace Dml std::vector m_pool; size_t m_currentAllocationId = 0; uint64_t m_currentResourceId = 0; - - // Unless specifically requested, allocation sizes are not rounded to enable pooling - // until SetDefaultRoundingMode is called. This should be done at completion of session + + // Unless specifically requested, allocation sizes are not rounded to enable pooling + // until SetDefaultRoundingMode is called. This should be done at completion of session // initialization. 
AllocatorRoundingMode m_defaultRoundingMode = AllocatorRoundingMode::Disabled; std::shared_ptr m_context; std::unique_ptr m_subAllocator; - #if _DEBUG + #ifndef NDEBUG // Useful for debugging; keeps track of all allocations that haven't been freed yet std::map m_outstandingAllocationsById; #endif diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc index a46f820c6207f..fde61e73c2124 100644 --- a/onnxruntime/core/providers/dml/dml_provider_factory.cc +++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc @@ -128,21 +128,13 @@ Microsoft::WRL::ComPtr DMLProviderFactoryCreator::CreateD3D12Devic return d3d12_device; } -std::shared_ptr DMLProviderFactoryCreator::Create(int device_id, bool skip_software_device_check) { - ComPtr d3d12_device = CreateD3D12Device(device_id, skip_software_device_check); - - D3D12_COMMAND_QUEUE_DESC cmd_queue_desc = {}; - cmd_queue_desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; - cmd_queue_desc.Flags = D3D12_COMMAND_QUEUE_FLAG_DISABLE_GPU_TIMEOUT; - - ComPtr cmd_queue; - ORT_THROW_IF_FAILED(d3d12_device->CreateCommandQueue(&cmd_queue_desc, IID_GRAPHICS_PPV_ARGS(cmd_queue.ReleaseAndGetAddressOf()))); - +Microsoft::WRL::ComPtr DMLProviderFactoryCreator::CreateDMLDevice(ID3D12Device* d3d12_device) +{ DML_CREATE_DEVICE_FLAGS flags = DML_CREATE_DEVICE_FLAG_NONE; // In debug builds, enable the DML debug layer if the D3D12 debug layer is also enabled #if _DEBUG && !_GAMING_XBOX - ComPtr debug_device; + Microsoft::WRL::ComPtr debug_device; (void)d3d12_device->QueryInterface(IID_PPV_ARGS(&debug_device)); // ignore failure const bool is_d3d12_debug_layer_enabled = (debug_device != nullptr); @@ -151,12 +143,27 @@ std::shared_ptr DMLProviderFactoryCreator::Create(int } #endif - ComPtr dml_device; - ORT_THROW_IF_FAILED(DMLCreateDevice1(d3d12_device.Get(), - flags, - DML_FEATURE_LEVEL_5_0, - IID_PPV_ARGS(&dml_device))); + Microsoft::WRL::ComPtr dml_device; + ORT_THROW_IF_FAILED(DMLCreateDevice1( + d3d12_device, + flags, + DML_FEATURE_LEVEL_5_0, + IID_PPV_ARGS(&dml_device))); + + return dml_device; +} + +std::shared_ptr DMLProviderFactoryCreator::Create(int device_id, bool skip_software_device_check) { + ComPtr d3d12_device = CreateD3D12Device(device_id, skip_software_device_check); + + D3D12_COMMAND_QUEUE_DESC cmd_queue_desc = {}; + cmd_queue_desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; + cmd_queue_desc.Flags = D3D12_COMMAND_QUEUE_FLAG_DISABLE_GPU_TIMEOUT; + + ComPtr cmd_queue; + ORT_THROW_IF_FAILED(d3d12_device->CreateCommandQueue(&cmd_queue_desc, IID_GRAPHICS_PPV_ARGS(cmd_queue.ReleaseAndGetAddressOf()))); + auto dml_device = CreateDMLDevice(d3d12_device.Get()); return CreateExecutionProviderFactory_DML(dml_device.Get(), cmd_queue.Get()); } diff --git a/onnxruntime/core/providers/dml/dml_provider_factory_creator.h b/onnxruntime/core/providers/dml/dml_provider_factory_creator.h index b1c9bb3f6f679..574f4410fe3e3 100644 --- a/onnxruntime/core/providers/dml/dml_provider_factory_creator.h +++ b/onnxruntime/core/providers/dml/dml_provider_factory_creator.h @@ -16,5 +16,6 @@ struct DMLProviderFactoryCreator { static std::shared_ptr Create(int device_id); static std::shared_ptr Create(int device_id, bool skip_software_device_check); static Microsoft::WRL::ComPtr CreateD3D12Device(int device_id, bool skip_software_device_check); + static Microsoft::WRL::ComPtr CreateDMLDevice(ID3D12Device* d3d12_device); }; } // namespace onnxruntime diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc 
b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc index 10c8a2de7c3df..f470e9f6b6ed1 100644 --- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc @@ -26,7 +26,18 @@ #include "core/framework/provider_options_utils.h" #ifdef USE_DML -#include "core/providers/dml/DmlExecutionProvider/src/DmlExternalBufferAllocator.h" +using Microsoft::WRL::ComPtr; + +#include +#include "core/providers/dml/DmlExecutionProvider/src/External/D3DX12/d3dx12.h" +#include "core/providers/dml/DmlExecutionProvider/src/ErrorHandling.h" +#include "core/providers/dml/DmlExecutionProvider/src/DescriptorPool.h" +#include "core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.h" +#include "core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h" +#include "core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h" +#include "core/providers/dml/DmlExecutionProvider/src/PooledUploadHeap.h" +#include "core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h" +#include "core/providers/dml/DmlExecutionProvider/src/AllocationInfo.h" #endif namespace onnxruntime { @@ -186,6 +197,11 @@ std::unique_ptr GetGPUDataTransfer() { #endif #ifdef USE_DML + +constexpr GUID execution_context_guid = {0x50fd773b, 0x4462, 0x4b28, {0x98, 0x9e, 0x8c, 0xa0, 0x54, 0x05, 0xbd, 0x4a}}; +constexpr GUID upload_heap_guid = {0x125235f9, 0xef41, 0x4043, {0xa4, 0x9d, 0xdd, 0xc9, 0x61, 0xe7, 0xdb, 0xee}}; +constexpr GUID dml_readback_heap_guid = {0x00d32df8, 0xea2d, 0x40bf, {0xa4, 0x47, 0x9c, 0xb4, 0xbc, 0xf1, 0x1d, 0x5e}}; + AllocatorPtr GetDmlAllocator(OrtDevice::DeviceId id) { // Current approach is not thread-safe, but there are some bigger infra pieces to put together in order to make // multi-threaded DML allocation work, including maintaining a per-thread DML allocator. 
@@ -196,13 +212,100 @@ AllocatorPtr GetDmlAllocator(OrtDevice::DeviceId id) { auto hit = id_to_allocator_map->find(id); if (hit == id_to_allocator_map->end()) { - auto dml_allocator = std::make_shared(id); + constexpr uint32_t device_id = 0; + auto d3d12_device = onnxruntime::DMLProviderFactoryCreator::CreateD3D12Device(device_id, false); + auto dml_device = onnxruntime::DMLProviderFactoryCreator::CreateDMLDevice(d3d12_device.Get()); + + D3D12_COMMAND_QUEUE_DESC cmd_queue_desc = {}; + cmd_queue_desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; + cmd_queue_desc.Flags = D3D12_COMMAND_QUEUE_FLAG_DISABLE_GPU_TIMEOUT; + + ComPtr cmd_queue; + ORT_THROW_IF_FAILED( + d3d12_device->CreateCommandQueue(&cmd_queue_desc, IID_PPV_ARGS(cmd_queue.ReleaseAndGetAddressOf()))); + + auto context = std::make_shared(d3d12_device.Get(), dml_device.Get(), cmd_queue.Get()); + + // We leak the upload and readback heaps to keep them alive, just like the map + auto upload_heap = std::make_unique(d3d12_device.Get(), context).release(); + auto readback_heap = std::make_unique(d3d12_device.Get(), context).release(); + + auto dml_allocator = std::make_shared( + d3d12_device.Get(), + context, + CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT), + D3D12_HEAP_FLAG_NONE, + D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + std::make_unique(d3d12_device.Get())); + dml_allocator->SetDefaultRoundingMode(AllocatorRoundingMode::Enabled); + context->SetAllocator(dml_allocator); + + auto context_ptr = context.get(); + + ORT_THROW_IF_FAILED(d3d12_device->SetPrivateData(execution_context_guid, sizeof(context_ptr), &context_ptr)); + ORT_THROW_IF_FAILED(d3d12_device->SetPrivateData(upload_heap_guid, sizeof(upload_heap), &upload_heap)); + ORT_THROW_IF_FAILED(d3d12_device->SetPrivateData(dml_readback_heap_guid, sizeof(readback_heap), &readback_heap)); + hit = id_to_allocator_map->emplace(id, std::move(dml_allocator)).first; } return hit->second; } +void CpuToDmlMemCpy(void* dst, const void* src, size_t num_bytes) { + const auto* allocInfo = static_cast(dst); + ID3D12Resource* dst_data = allocInfo->GetResource(); + + ComPtr d3d12_device; + ORT_THROW_IF_FAILED(dst_data->GetDevice(IID_PPV_ARGS(d3d12_device.ReleaseAndGetAddressOf()))); + + Dml::ExecutionContext* context = nullptr; + uint32_t context_size = gsl::narrow_cast(sizeof(context)); + ORT_THROW_IF_FAILED(d3d12_device->GetPrivateData(execution_context_guid, &context_size, &context)); + + Dml::PooledUploadHeap* upload_heap = nullptr; + uint32_t upload_heap_size = gsl::narrow_cast(sizeof(upload_heap)); + ORT_THROW_IF_FAILED(d3d12_device->GetPrivateData(upload_heap_guid, &upload_heap_size, &upload_heap)); + + upload_heap->BeginUploadToGpu( + dst_data, 0, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, gsl::make_span(static_cast(src), num_bytes)); + context->Flush(); + + // We don't use the same command queue as the execution provider, so we need to sync to make sure that all data has + // been uploaded to the resource. This function is usually called before inference just to upload initial data to the + // GPU, so it shouldn't be a bottleneck. 
+ context->GetCurrentCompletionEvent().WaitForSignal(); +} + +void DmlToCpuMemCpy(void* dst, const void* src, size_t num_bytes) { + const auto* allocInfo = static_cast(src); + ID3D12Resource* src_data = allocInfo->GetResource(); + + ComPtr d3d12_device; + ORT_THROW_IF_FAILED(src_data->GetDevice(IID_PPV_ARGS(d3d12_device.ReleaseAndGetAddressOf()))); + + Dml::ExecutionContext* context = nullptr; + uint32_t context_size = gsl::narrow_cast(sizeof(context)); + ORT_THROW_IF_FAILED(d3d12_device->GetPrivateData(execution_context_guid, &context_size, &context)); + + Dml::ReadbackHeap* readback_heap = nullptr; + uint32_t readback_heap_size = gsl::narrow_cast(sizeof(readback_heap)); + ORT_THROW_IF_FAILED(d3d12_device->GetPrivateData(dml_readback_heap_guid, &readback_heap_size, &readback_heap)); + + // ReadbackFromGpu already syncs with the CPU and waits for the copy to be completed, so we don't need to sync after + // this call + readback_heap->ReadbackFromGpu( + gsl::make_span(static_cast(dst), num_bytes), src_data, 0, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); +} + +const std::unordered_map* GetDmlToHostMemCpyFunction() { + static std::unordered_map map{ + {OrtDevice::GPU, DmlToCpuMemCpy}}; + + return ↦ +} + #endif #ifdef USE_CANN diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.h b/onnxruntime/python/onnxruntime_pybind_mlvalue.h index 4ac9c70468b19..e3f277bcb9c41 100644 --- a/onnxruntime/python/onnxruntime_pybind_mlvalue.h +++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.h @@ -77,6 +77,12 @@ std::unique_ptr GetGPUDataTransfer(); AllocatorPtr GetDmlAllocator(OrtDevice::DeviceId id); +void CpuToDmlMemCpy(void* dst, const void* src, size_t num_bytes); + +void DmlToCpuMemCpy(void* dst, const void* src, size_t num_bytes); + +const std::unordered_map* GetDmlToHostMemCpyFunction(); + #endif #ifdef USE_CANN diff --git a/onnxruntime/python/onnxruntime_pybind_ortvalue.cc b/onnxruntime/python/onnxruntime_pybind_ortvalue.cc index f9d908e0ac518..dc4a4dcc13b7f 100644 --- a/onnxruntime/python/onnxruntime_pybind_ortvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_ortvalue.cc @@ -63,7 +63,12 @@ void addOrtValueMethods(pybind11::module& m) { // Likewise, there is no need to specify the name (as the name was previously used to lookup the def list) // TODO: Add check to ensure that string arrays are not passed - we currently don't support string tensors in CUDA CreateGenericMLValue(nullptr, GetRocmAllocator(device.Id()), "", array_on_cpu, ml_value.get(), true, false, CpuToRocmMemCpy); - +#elif USE_DML + // InputDeflist is null because OrtValue creation is not tied to a specific model + // Likewise, there is no need to specify the name (as the name was previously used to lookup the def list) + // TODO: Add check to ensure that string arrays are not passed - we currently don't support string tensors in DML + CreateGenericMLValue( + nullptr, GetDmlAllocator(device.Id()), "", array_on_cpu, ml_value.get(), true, false, CpuToDmlMemCpy); #else throw std::runtime_error( "Can't allocate memory on the CUDA device using this package of OnnxRuntime. 
" @@ -126,6 +131,12 @@ void addOrtValueMethods(pybind11::module& m) { values_type, *(ml_value->GetMutable()), CpuToRocmMemCpy); +#elif USE_DML + onnxruntime::python::CopyDataToTensor( + py_values, + values_type, + *(ml_value->GetMutable()), + CpuToDmlMemCpy); #else throw std::runtime_error( "Unsupported GPU device: Cannot find the supported GPU device."); @@ -158,12 +169,18 @@ void addOrtValueMethods(pybind11::module& m) { throw std::runtime_error("The provided device id doesn't match any available GPUs on the machine."); } allocator = GetCudaAllocator(device.Id()); -#elif USE_DML - allocator = GetDmlAllocator(device.Id()); #else throw std::runtime_error( "Can't allocate memory on the CUDA device using this package of OnnxRuntime. " "Please use the CUDA package of OnnxRuntime to use this feature."); +#endif + } else if (strcmp(GetDeviceName(device), DML) == 0) { +#if USE_DML + allocator = GetDmlAllocator(device.Id()); +#else + throw std::runtime_error( + "Can't allocate memory on the DirectML device using this package of OnnxRuntime. " + "Please use the DirectML package of OnnxRuntime to use this feature."); #endif } else { throw std::runtime_error("Unsupported device: Cannot place the OrtValue on this device"); @@ -290,11 +307,13 @@ void addOrtValueMethods(pybind11::module& m) { #ifdef USE_CUDA GetPyObjFromTensor(ml_value->Get(), obj, nullptr, GetCudaToHostMemCpyFunction()); #elif USE_ROCM - GetPyObjFromTensor(ml_value->Get(), obj, nullptr, GetRocmToHostMemCpyFunction()); + GetPyObjFromTensor(ml_value->Get(), obj, nullptr, GetRocmToHostMemCpyFunction()); #elif USE_CANN - GetPyObjFromTensor(ml_value->Get(), obj, nullptr, GetCannToHostMemCpyFunction()); + GetPyObjFromTensor(ml_value->Get(), obj, nullptr, GetCannToHostMemCpyFunction()); +#elif USE_DML + GetPyObjFromTensor(ml_value->Get(), obj, nullptr, GetDmlToHostMemCpyFunction()); #else - GetPyObjFromTensor(ml_value->Get(), obj, nullptr, nullptr); + GetPyObjFromTensor(ml_value->Get(), obj, nullptr, nullptr); #endif return obj; }) diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 82d119894a5d8..907ea0ec41e23 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -237,7 +237,11 @@ const char* GetDeviceName(const OrtDevice& device) { case OrtDevice::CPU: return CPU; case OrtDevice::GPU: +#ifdef USE_DML + return DML; +#else return CUDA; +#endif case OrtDevice::FPGA: return "FPGA"; case OrtDevice::NPU: diff --git a/onnxruntime/test/python/onnxruntime_test_python_iobinding.py b/onnxruntime/test/python/onnxruntime_test_python_iobinding.py index 8009d97ba34ce..56417f13fbea4 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_iobinding.py +++ b/onnxruntime/test/python/onnxruntime_test_python_iobinding.py @@ -16,40 +16,43 @@ from onnxruntime.capi._pybind_state import OrtValue as C_OrtValue from onnxruntime.capi._pybind_state import OrtValueVector, SessionIOBinding +test_params = [ + ("cuda", "CUDAExecutionProvider", C_OrtDevice.cuda), + ("dml", "DmlExecutionProvider", C_OrtDevice.dml), +] + class TestIOBinding(unittest.TestCase): - def create_ortvalue_input_on_gpu(self): + def _create_ortvalue_input_on_gpu(self, device): return onnxrt.OrtValue.ortvalue_from_numpy( - np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32), "cuda", 0 + np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32), device, 0 ) - def create_ortvalue_alternate_input_on_gpu(self): + def 
_create_ortvalue_alternate_input_on_gpu(self, device): return onnxrt.OrtValue.ortvalue_from_numpy( np.array([[2.0, 4.0], [6.0, 8.0], [10.0, 12.0]], dtype=np.float32), - "cuda", + device, 0, ) - def create_uninitialized_ortvalue_input_on_gpu(self): - return onnxrt.OrtValue.ortvalue_from_shape_and_type([3, 2], np.float32, "cuda", 0) + def _create_uninitialized_ortvalue_input_on_gpu(self, device): + return onnxrt.OrtValue.ortvalue_from_shape_and_type([3, 2], np.float32, device, 0) - def create_numpy_input(self): + def _create_numpy_input(self): return np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) - def create_expected_output(self): + def _create_expected_output(self): return np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - def create_expected_output_alternate(self): + def _create_expected_output_alternate(self): return np.array([[2.0, 8.0], [18.0, 32.0], [50.0, 72.0]], dtype=np.float32) def test_bind_input_to_cpu_arr(self): - self.create_numpy_input() - session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) io_binding = session.io_binding() # Bind Numpy object (input) that's on CPU to wherever the model needs it - io_binding.bind_cpu_input("X", self.create_numpy_input()) + io_binding.bind_cpu_input("X", self._create_numpy_input()) # Bind output to CPU io_binding.bind_output("Y") @@ -57,254 +60,280 @@ def test_bind_input_to_cpu_arr(self): # Invoke Run session.run_with_iobinding(io_binding) - # Sync if different CUDA streams + # Sync if different streams io_binding.synchronize_outputs() - # Get outputs over to CPU (the outputs which were bound to CUDA will get copied over to the host here) + # Get outputs over to CPU (the outputs which were bound to the GPU will get copied over to the host here) ort_output = io_binding.copy_outputs_to_cpu()[0] # Validate results - self.assertTrue(np.array_equal(self.create_expected_output(), ort_output)) + self.assertTrue(np.array_equal(self._create_expected_output(), ort_output)) - @unittest.skip("Could not find an implementation for Identity(19) node with name ''") def test_bind_input_types(self): - opset = onnx_opset_version() - devices = [ - ( - C_OrtDevice(C_OrtDevice.cpu(), C_OrtDevice.default_memory(), 0), - ["CPUExecutionProvider"], - ) - ] - if "CUDAExecutionProvider" in onnxrt.get_all_providers(): - devices.append( - ( - C_OrtDevice(C_OrtDevice.cuda(), C_OrtDevice.default_memory(), 0), - ["CUDAExecutionProvider"], - ) - ) - - for device, provider in devices: - for dtype in [ - np.float32, - np.float64, - np.int32, - np.uint32, - np.int64, - np.uint64, - np.int16, - np.uint16, - np.int8, - np.uint8, - np.float16, - np.bool_, - ]: - with self.subTest(dtype=dtype, device=str(device)): - x = np.arange(8).reshape((-1, 2)).astype(dtype) - proto_dtype = NP_TYPE_TO_TENSOR_TYPE[x.dtype] - - X = helper.make_tensor_value_info("X", proto_dtype, [None, x.shape[1]]) # noqa: N806 - Y = helper.make_tensor_value_info("Y", proto_dtype, [None, x.shape[1]]) # noqa: N806 - - # inference - node_add = helper.make_node("Identity", ["X"], ["Y"]) - - # graph - graph_def = helper.make_graph([node_add], "lr", [X], [Y], []) - model_def = helper.make_model( - graph_def, - producer_name="dummy", - ir_version=7, - producer_version="0", - opset_imports=[helper.make_operatorsetid("", opset)], - ) - - sess = onnxrt.InferenceSession(model_def.SerializeToString(), providers=provider) - - bind = SessionIOBinding(sess._sess) - ort_value = C_OrtValue.ortvalue_from_numpy(x, device) - 
bind.bind_ortvalue_input("X", ort_value) - bind.bind_output("Y", device) - sess._sess.run_with_iobinding(bind, None) - ortvaluevector = bind.get_outputs() - self.assertIsInstance(ortvaluevector, OrtValueVector) - ortvalue = bind.get_outputs()[0] - y = ortvalue.numpy() - assert_almost_equal(x, y) - - bind = SessionIOBinding(sess._sess) - bind.bind_input("X", device, dtype, x.shape, ort_value.data_ptr()) - bind.bind_output("Y", device) - sess._sess.run_with_iobinding(bind, None) - ortvalue = bind.get_outputs()[0] - y = ortvalue.numpy() - assert_almost_equal(x, y) + for device, execution_provider, generate_device in test_params: + with self.subTest(execution_provider): + if execution_provider not in onnxrt.get_available_providers(): + self.skipTest(f"Skipping on {device.upper()}.") + + opset = onnx_opset_version() + devices = [ + ( + C_OrtDevice(C_OrtDevice.cpu(), C_OrtDevice.default_memory(), 0), + ["CPUExecutionProvider"], + ), + ( + C_OrtDevice(generate_device(), C_OrtDevice.default_memory(), 0), + [execution_provider], + ), + ] + + for inner_device, provider in devices: + for dtype in [ + np.float32, + np.float64, + np.int32, + np.uint32, + np.int64, + np.uint64, + np.int16, + np.uint16, + np.int8, + np.uint8, + np.float16, + np.bool_, + ]: + with self.subTest(dtype=dtype, inner_device=str(inner_device)): + x = np.arange(8).reshape((-1, 2)).astype(dtype) + proto_dtype = NP_TYPE_TO_TENSOR_TYPE[x.dtype] + + X = helper.make_tensor_value_info("X", proto_dtype, [None, x.shape[1]]) # noqa: N806 + Y = helper.make_tensor_value_info("Y", proto_dtype, [None, x.shape[1]]) # noqa: N806 + + # inference + node_add = helper.make_node("Identity", ["X"], ["Y"]) + + # graph + graph_def = helper.make_graph([node_add], "lr", [X], [Y], []) + model_def = helper.make_model( + graph_def, + producer_name="dummy", + ir_version=7, + producer_version="0", + opset_imports=[helper.make_operatorsetid("", opset)], + ) + + sess = onnxrt.InferenceSession(model_def.SerializeToString(), providers=provider) + + bind = SessionIOBinding(sess._sess) + ort_value = C_OrtValue.ortvalue_from_numpy(x, inner_device) + bind.bind_ortvalue_input("X", ort_value) + bind.bind_output("Y", inner_device) + sess._sess.run_with_iobinding(bind, None) + ortvaluevector = bind.get_outputs() + self.assertIsInstance(ortvaluevector, OrtValueVector) + ortvalue = bind.get_outputs()[0] + y = ortvalue.numpy() + assert_almost_equal(x, y) + + bind = SessionIOBinding(sess._sess) + bind.bind_input("X", inner_device, dtype, x.shape, ort_value.data_ptr()) + bind.bind_output("Y", inner_device) + sess._sess.run_with_iobinding(bind, None) + ortvalue = bind.get_outputs()[0] + y = ortvalue.numpy() + assert_almost_equal(x, y) def test_bind_input_only(self): - input = self.create_ortvalue_input_on_gpu() + for device, execution_provider, _ in test_params: + with self.subTest(execution_provider): + if execution_provider not in onnxrt.get_available_providers(): + self.skipTest(f"Skipping on {device.upper()}.") + input = self._create_ortvalue_input_on_gpu(device) - session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) - io_binding = session.io_binding() + session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) + io_binding = session.io_binding() - # Bind input to CUDA - io_binding.bind_input("X", "cuda", 0, np.float32, [3, 2], input.data_ptr()) + # Bind input to the GPU + io_binding.bind_input("X", device, 0, np.float32, [3, 2], input.data_ptr()) - # Sync if different CUDA 
streams - io_binding.synchronize_inputs() + # Sync if different streams + io_binding.synchronize_inputs() - # Bind output to CPU - io_binding.bind_output("Y") + # Bind output to CPU + io_binding.bind_output("Y") - # Invoke Run - session.run_with_iobinding(io_binding) + # Invoke Run + session.run_with_iobinding(io_binding) - # Sync if different CUDA streams - io_binding.synchronize_outputs() + # Sync if different streams + io_binding.synchronize_outputs() - # Get outputs over to CPU (the outputs which were bound to CUDA will get copied over to the host here) - ort_output = io_binding.copy_outputs_to_cpu()[0] + # Get outputs over to CPU (the outputs which were bound to the GPU will get copied over to the host + # here) + ort_output = io_binding.copy_outputs_to_cpu()[0] - # Validate results - self.assertTrue(np.array_equal(self.create_expected_output(), ort_output)) + # Validate results + self.assertTrue(np.array_equal(self._create_expected_output(), ort_output)) def test_bind_input_and_preallocated_output(self): - input = self.create_ortvalue_input_on_gpu() + for device, execution_provider, _ in test_params: + with self.subTest(execution_provider): + if execution_provider not in onnxrt.get_available_providers(): + self.skipTest(f"Skipping on {device.upper()}.") - session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) - io_binding = session.io_binding() - - # Bind input to CUDA - io_binding.bind_input("X", "cuda", 0, np.float32, [3, 2], input.data_ptr()) - - # Bind output to CUDA - output = self.create_uninitialized_ortvalue_input_on_gpu() - io_binding.bind_output("Y", "cuda", 0, np.float32, [3, 2], output.data_ptr()) - - # Sync if different CUDA streams - io_binding.synchronize_inputs() - - # Invoke Run - session.run_with_iobinding(io_binding) + input = self._create_ortvalue_input_on_gpu(device) - # Sync if different CUDA streams - io_binding.synchronize_outputs() + session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) + io_binding = session.io_binding() - # Get outputs over to CPU (the outputs which were bound to CUDA will get copied over to the host here) - ort_output_vals = io_binding.copy_outputs_to_cpu()[0] - # Validate results - self.assertTrue(np.array_equal(self.create_expected_output(), ort_output_vals)) + # Bind input to the GPU + io_binding.bind_input("X", device, 0, np.float32, [3, 2], input.data_ptr()) - # Validate if ORT actually wrote to pre-allocated buffer by copying the Torch allocated buffer - # to the host and validating its contents - ort_output_vals_in_cpu = output.numpy() - # Validate results - self.assertTrue(np.array_equal(self.create_expected_output(), ort_output_vals_in_cpu)) + # Bind output to the GPU + output = self._create_uninitialized_ortvalue_input_on_gpu(device) + io_binding.bind_output("Y", device, 0, np.float32, [3, 2], output.data_ptr()) - def test_bind_input_and_non_preallocated_output(self): - session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) - io_binding = session.io_binding() + # Sync if different streams + io_binding.synchronize_inputs() - # Bind input to CUDA - io_binding.bind_input( - "X", - "cuda", - 0, - np.float32, - [3, 2], - self.create_ortvalue_input_on_gpu().data_ptr(), - ) + # Invoke Run + session.run_with_iobinding(io_binding) - # Bind output to CUDA - io_binding.bind_output("Y", "cuda") + # Sync if different streams + io_binding.synchronize_outputs() - # Sync if different CUDA streams - 
io_binding.synchronize_inputs() + # Get outputs over to CPU (the outputs which were bound to the GPU will get copied over to the host + # here) + ort_output_vals = io_binding.copy_outputs_to_cpu()[0] + # Validate results + self.assertTrue(np.array_equal(self._create_expected_output(), ort_output_vals)) - # Invoke Run - session.run_with_iobinding(io_binding) + # Validate if ORT actually wrote to pre-allocated buffer by copying the allocated buffer + # to the host and validating its contents + ort_output_vals_in_cpu = output.numpy() + # Validate results + self.assertTrue(np.array_equal(self._create_expected_output(), ort_output_vals_in_cpu)) - # Sync if different CUDA streams - io_binding.synchronize_outputs() + def test_bind_input_and_non_preallocated_output(self): + for device, execution_provider, _ in test_params: + with self.subTest(execution_provider): + if execution_provider not in onnxrt.get_available_providers(): + self.skipTest(f"Skipping on {device.upper()}.") + + session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) + io_binding = session.io_binding() + + input = self._create_ortvalue_input_on_gpu(device) + + # Bind input to the GPU + io_binding.bind_input( + "X", + device, + 0, + np.float32, + [3, 2], + input.data_ptr(), + ) - # This call returns an OrtValue which has data allocated by ORT on CUDA - ort_outputs = io_binding.get_outputs() - self.assertEqual(len(ort_outputs), 1) - self.assertEqual(ort_outputs[0].device_name(), "cuda") - # Validate results (by copying results to CPU by creating a Numpy object) - self.assertTrue(np.array_equal(self.create_expected_output(), ort_outputs[0].numpy())) - - # We should be able to repeat the above process as many times as we want - try once more - ort_outputs = io_binding.get_outputs() - self.assertEqual(len(ort_outputs), 1) - self.assertEqual(ort_outputs[0].device_name(), "cuda") - # Validate results (by copying results to CPU by creating a Numpy object) - self.assertTrue(np.array_equal(self.create_expected_output(), ort_outputs[0].numpy())) - - # Change the bound input and validate the results in the same bound OrtValue - # Bind alternate input to CUDA - io_binding.bind_input( - "X", - "cuda", - 0, - np.float32, - [3, 2], - self.create_ortvalue_alternate_input_on_gpu().data_ptr(), - ) + # Bind output to the GPU + io_binding.bind_output("Y", device) + + # Sync if different streams + io_binding.synchronize_inputs() + + # Invoke Run + session.run_with_iobinding(io_binding) + + # Sync if different streams + io_binding.synchronize_outputs() + + # This call returns an OrtValue which has data allocated by ORT on the GPU + ort_outputs = io_binding.get_outputs() + self.assertEqual(len(ort_outputs), 1) + self.assertEqual(ort_outputs[0].device_name(), device) + # Validate results (by copying results to CPU by creating a Numpy object) + self.assertTrue(np.array_equal(self._create_expected_output(), ort_outputs[0].numpy())) + + # We should be able to repeat the above process as many times as we want - try once more + ort_outputs = io_binding.get_outputs() + self.assertEqual(len(ort_outputs), 1) + self.assertEqual(ort_outputs[0].device_name(), device) + # Validate results (by copying results to CPU by creating a Numpy object) + self.assertTrue(np.array_equal(self._create_expected_output(), ort_outputs[0].numpy())) + + input = self._create_ortvalue_alternate_input_on_gpu(device) + + # Change the bound input and validate the results in the same bound OrtValue + # Bind alternate input to the GPU + 
io_binding.bind_input( + "X", + device, + 0, + np.float32, + [3, 2], + input.data_ptr(), + ) - # Sync if different CUDA streams - io_binding.synchronize_inputs() + # Sync if different streams + io_binding.synchronize_inputs() - # Invoke Run - session.run_with_iobinding(io_binding) + # Invoke Run + session.run_with_iobinding(io_binding) - # Sync if different CUDA streams - io_binding.synchronize_outputs() + # Sync if different streams + io_binding.synchronize_outputs() - # This call returns an OrtValue which has data allocated by ORT on CUDA - ort_outputs = io_binding.get_outputs() - self.assertEqual(len(ort_outputs), 1) - self.assertEqual(ort_outputs[0].device_name(), "cuda") - # Validate results (by copying results to CPU by creating a Numpy object) - self.assertTrue(np.array_equal(self.create_expected_output_alternate(), ort_outputs[0].numpy())) + # This call returns an OrtValue which has data allocated by ORT on the GPU + ort_outputs = io_binding.get_outputs() + self.assertEqual(len(ort_outputs), 1) + self.assertEqual(ort_outputs[0].device_name(), device) + # Validate results (by copying results to CPU by creating a Numpy object) + self.assertTrue(np.array_equal(self._create_expected_output_alternate(), ort_outputs[0].numpy())) def test_bind_input_and_bind_output_with_ortvalues(self): - session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) - io_binding = session.io_binding() + for device, execution_provider, _ in test_params: + with self.subTest(execution_provider): + if execution_provider not in onnxrt.get_available_providers(): + self.skipTest(f"Skipping on {device.upper()}.") - # Bind ortvalue as input - input_ortvalue = self.create_ortvalue_input_on_gpu() - io_binding.bind_ortvalue_input("X", input_ortvalue) + session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) + io_binding = session.io_binding() - # Bind ortvalue as output - output_ortvalue = self.create_uninitialized_ortvalue_input_on_gpu() - io_binding.bind_ortvalue_output("Y", output_ortvalue) + # Bind ortvalue as input + input_ortvalue = self._create_ortvalue_input_on_gpu(device) + io_binding.bind_ortvalue_input("X", input_ortvalue) - # Sync if different CUDA streams - io_binding.synchronize_inputs() + # Bind ortvalue as output + output_ortvalue = self._create_uninitialized_ortvalue_input_on_gpu(device) + io_binding.bind_ortvalue_output("Y", output_ortvalue) - # Invoke Run - session.run_with_iobinding(io_binding) + # Sync if different streams + io_binding.synchronize_inputs() - # Sync if different CUDA streams - io_binding.synchronize_outputs() + # Invoke Run + session.run_with_iobinding(io_binding) - # Inspect contents of output_ortvalue and make sure that it has the right contents - self.assertTrue(np.array_equal(self.create_expected_output(), output_ortvalue.numpy())) + # Sync if different streams + io_binding.synchronize_outputs() - # Bind another ortvalue as input - input_ortvalue_2 = self.create_ortvalue_alternate_input_on_gpu() - io_binding.bind_ortvalue_input("X", input_ortvalue_2) + # Inspect contents of output_ortvalue and make sure that it has the right contents + self.assertTrue(np.array_equal(self._create_expected_output(), output_ortvalue.numpy())) - # Sync if different CUDA streams - io_binding.synchronize_inputs() + # Bind another ortvalue as input + input_ortvalue_2 = self._create_ortvalue_alternate_input_on_gpu(device) + io_binding.bind_ortvalue_input("X", input_ortvalue_2) - # Invoke Run - 
session.run_with_iobinding(io_binding) + # Sync if different streams + io_binding.synchronize_inputs() - # Sync if different CUDA streams - io_binding.synchronize_outputs() + # Invoke Run + session.run_with_iobinding(io_binding) + + # Sync if different streams + io_binding.synchronize_outputs() - # Inspect contents of output_ortvalue and make sure that it has the right contents - self.assertTrue(np.array_equal(self.create_expected_output_alternate(), output_ortvalue.numpy())) + # Inspect contents of output_ortvalue and make sure that it has the right contents + self.assertTrue(np.array_equal(self._create_expected_output_alternate(), output_ortvalue.numpy())) if __name__ == "__main__": diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 48129e15934dc..c4fb5499983cb 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1840,13 +1840,12 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): [sys.executable, "onnxruntime_test_python_symbolic_shape_infer.py"], cwd=cwd, dll_path=dll_path ) - # For CUDA enabled builds test IOBinding feature - if args.use_cuda: - # We need to have Torch installed to test the IOBinding feature - # which currently uses Torch's allocator to allocate GPU memory for testing + # For CUDA or DML enabled builds test IOBinding feature + if args.use_cuda or args.use_dml: log.info("Testing IOBinding feature") run_subprocess([sys.executable, "onnxruntime_test_python_iobinding.py"], cwd=cwd, dll_path=dll_path) + if args.use_cuda: log.info("Testing CUDA Graph feature") run_subprocess([sys.executable, "onnxruntime_test_python_cudagraph.py"], cwd=cwd, dll_path=dll_path) From 5d3786206bbe8a84b9d3f9c62e310258e3570ff2 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Wed, 13 Sep 2023 08:50:14 -0700 Subject: [PATCH 14/34] Fix ROCM's nightly build (#17518) ### Description PR 15470 updated some C/C++ dependencies. The change caused ROCM EP's nightly build to fail. see issue https://github.com/ROCm-Developer-Tools/HIP/issues/2082 for a background. So, the root cause is HIP compiler has a special requirement that HIP's include dirs must be used before the operating system's include folder: /usr/include. HIP adds "-isystem" in front of "/usr/include". gcc or clang will search the folders added with "-I" first, then the "-isystem" folder. It works fine as long as we do not add "-I/usr/include" to the compile commands for *.cu files. It would be wrong if we already have installed an open source library to /usr and want to use the prebuilt library from there instead of the current build dir. 
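As a minimal sketch of the failure mode (the compile line and paths below are hypothetical, not copied from the failing build): when a dependency is resolved from a copy already installed under `/usr`, the compile commands for `*.cu` files can end up with `/usr/include` ahead of HIP's own include directories, which is the ordering HIP cannot tolerate. Setting `FETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER` makes CMake's FetchContent always build the declared dependencies from source rather than picking up such system copies:

```bash
# Hypothetical illustration only; the real flags come from the ROCm CI build.
#
#   clang++ -I/usr/include -isystem /opt/rocm/include kernel.cu ...
#
# -I directories are searched before -isystem directories, so headers under
# /usr/include would shadow the HIP headers under /opt/rocm/include.
#
# Forcing FetchContent to skip find_package() keeps preinstalled copies in
# /usr out of the build, which is what the pipeline change below does:
cmake -S . -B build -DFETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER <other build options>
```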
### Motivation and Context --- tools/ci_build/github/azure-pipelines/templates/rocm.yml | 2 +- tools/ci_build/github/linux/build_rocm_c_api_package.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/rocm.yml b/tools/ci_build/github/azure-pipelines/templates/rocm.yml index fe0f2c3791e72..cc2e8745e8946 100644 --- a/tools/ci_build/github/azure-pipelines/templates/rocm.yml +++ b/tools/ci_build/github/azure-pipelines/templates/rocm.yml @@ -91,7 +91,7 @@ jobs: --enable_training \ --cmake_extra_defines \ CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \ - onnxruntime_BUILD_UNIT_TESTS=OFF \ + onnxruntime_BUILD_UNIT_TESTS=OFF FETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER \ ${{ variables['EnableProfiling'] }} workingDirectory: $(Build.SourcesDirectory) displayName: 'Build onnxruntime (in container)' diff --git a/tools/ci_build/github/linux/build_rocm_c_api_package.sh b/tools/ci_build/github/linux/build_rocm_c_api_package.sh index 4d0af63893643..957f1f8a812a5 100755 --- a/tools/ci_build/github/linux/build_rocm_c_api_package.sh +++ b/tools/ci_build/github/linux/build_rocm_c_api_package.sh @@ -40,7 +40,7 @@ docker run --rm \ --use_rocm --rocm_version=$ROCM_VERSION --rocm_home $ROCM_HOME --nccl_home $ROCM_HOME \ --build_shared_lib \ --skip_submodule_sync \ - --skip_tests \ + --skip_tests --cmake_extra_defines FETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER EXIT_CODE=$? From a2e75114cc9c38ab4c48fac4c884d53f2d0d17d4 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 13 Sep 2023 09:17:34 -0700 Subject: [PATCH 15/34] [js/web] add sessionOptions.freeDimensionOverrides (#17488) ### Description Allows to specify fixed size for dynamic input of a model. resolves #16707 Pending test --- js/common/lib/inference-session.ts | 7 +++++++ js/web/lib/wasm/binding/ort-wasm.d.ts | 1 + js/web/lib/wasm/session-options.ts | 15 +++++++++++++++ onnxruntime/wasm/api.cc | 6 ++++++ onnxruntime/wasm/api.h | 7 +++++++ 5 files changed, 36 insertions(+) diff --git a/js/common/lib/inference-session.ts b/js/common/lib/inference-session.ts index ec030084c9675..71a5912df2464 100644 --- a/js/common/lib/inference-session.ts +++ b/js/common/lib/inference-session.ts @@ -66,6 +66,13 @@ export declare namespace InferenceSession { */ interOpNumThreads?: number; + /** + * The free dimension override. + * + * This setting is available only in ONNXRuntime (Node.js binding and react-native) or WebAssembly backend + */ + freeDimensionOverrides?: {readonly [dimensionName: string]: number}; + /** * The optimization level. 
* diff --git a/js/web/lib/wasm/binding/ort-wasm.d.ts b/js/web/lib/wasm/binding/ort-wasm.d.ts index 7f0430b7b28b9..59da1369e152e 100644 --- a/js/web/lib/wasm/binding/ort-wasm.d.ts +++ b/js/web/lib/wasm/binding/ort-wasm.d.ts @@ -54,6 +54,7 @@ export interface OrtWasmModule extends EmscriptenModule { enableProfiling: boolean, profileFilePrefix: number, logId: number, logSeverityLevel: number, logVerbosityLevel: number, optimizedModelFilePath: number): number; _OrtAppendExecutionProvider(sessionOptionsHandle: number, name: number): number; + _OrtAddFreeDimensionOverride(sessionOptionsHandle: number, name: number, dim: number): number; _OrtAddSessionConfigEntry(sessionOptionsHandle: number, configKey: number, configValue: number): number; _OrtReleaseSessionOptions(sessionOptionsHandle: number): void; diff --git a/js/web/lib/wasm/session-options.ts b/js/web/lib/wasm/session-options.ts index 38caa9076e3c0..2659b471733f5 100644 --- a/js/web/lib/wasm/session-options.ts +++ b/js/web/lib/wasm/session-options.ts @@ -143,6 +143,21 @@ export const setSessionOptions = (options?: InferenceSession.SessionOptions): [n setExecutionProviders(sessionOptionsHandle, sessionOptions.executionProviders, allocs); } + if (sessionOptions.freeDimensionOverrides) { + for (const [name, value] of Object.entries(sessionOptions.freeDimensionOverrides)) { + if (typeof name !== 'string') { + throw new Error(`free dimension override name must be a string: ${name}`); + } + if (typeof value !== 'number' || !Number.isInteger(value) || value < 0) { + throw new Error(`free dimension override value must be a non-negative integer: ${value}`); + } + const nameOffset = allocWasmString(name, allocs); + if (wasm._OrtAddFreeDimensionOverride(sessionOptionsHandle, nameOffset, value) !== 0) { + checkLastError(`Can't set a free dimension override: ${name} - ${value}.`); + } + } + } + if (sessionOptions.extra !== undefined) { iterateExtraOptions(sessionOptions.extra, '', new WeakSet>(), (key, value) => { const keyDataOffset = allocWasmString(key, allocs); diff --git a/onnxruntime/wasm/api.cc b/onnxruntime/wasm/api.cc index 937d505015d3c..174edabbc91fe 100644 --- a/onnxruntime/wasm/api.cc +++ b/onnxruntime/wasm/api.cc @@ -155,6 +155,12 @@ int OrtAppendExecutionProvider(ort_session_options_handle_t session_options, con return CHECK_STATUS(SessionOptionsAppendExecutionProvider, session_options, name, nullptr, nullptr, 0); } +int OrtAddFreeDimensionOverride(ort_session_options_handle_t session_options, + const char* dim_param_name, + int dim_value) { + return CHECK_STATUS(AddFreeDimensionOverrideByName, session_options, dim_param_name, dim_value); +} + int OrtAddSessionConfigEntry(OrtSessionOptions* session_options, const char* config_key, const char* config_value) { diff --git a/onnxruntime/wasm/api.h b/onnxruntime/wasm/api.h index b9103414aae67..398c901e0e5ed 100644 --- a/onnxruntime/wasm/api.h +++ b/onnxruntime/wasm/api.h @@ -84,6 +84,13 @@ ort_session_options_handle_t EMSCRIPTEN_KEEPALIVE OrtCreateSessionOptions(size_t int EMSCRIPTEN_KEEPALIVE OrtAppendExecutionProvider(ort_session_options_handle_t session_options, const char* name); +/** + * add a free dimension override for one dimension of a session's input. + */ +int EMSCRIPTEN_KEEPALIVE OrtAddFreeDimensionOverride(ort_session_options_handle_t session_options, + const char* dim_param_name, + int dim_value); + /** * store configurations for a session. 
* @param session_options a handle to session options created by OrtCreateSessionOptions From 4e37c5d1f0c54d43ba2662f490d84cd0333c8813 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 13 Sep 2023 09:22:21 -0700 Subject: [PATCH 16/34] Bump actions/checkout from 3 to 4 (#17487) --- .github/workflows/cffconvert.yml | 2 +- .github/workflows/codeql.yml | 2 +- .github/workflows/gradle-wrapper-validation.yml | 2 +- .github/workflows/lint.yml | 6 +++--- .github/workflows/linux.yml | 2 +- .github/workflows/publish-c-apidocs.yml | 2 +- .github/workflows/publish-csharp-apidocs.yml | 2 +- .github/workflows/publish-java-apidocs.yml | 2 +- .github/workflows/publish-js-apidocs.yml | 2 +- .github/workflows/publish-objectivec-apidocs.yml | 2 +- .github/workflows/publish-python-apidocs.yml | 2 +- .github/workflows/windows.yml | 4 ++-- 12 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/workflows/cffconvert.yml b/.github/workflows/cffconvert.yml index 6851c52d380ec..7144363717749 100644 --- a/.github/workflows/cffconvert.yml +++ b/.github/workflows/cffconvert.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out a copy of the repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Check whether the citation metadata from CITATION.cff is valid uses: citation-file-format/cffconvert-github-action@2.0.0 diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 2fe66013ebbbc..d3ecf44fe5733 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -33,7 +33,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL diff --git a/.github/workflows/gradle-wrapper-validation.yml b/.github/workflows/gradle-wrapper-validation.yml index 07346b38b2151..03ea773a25130 100644 --- a/.github/workflows/gradle-wrapper-validation.yml +++ b/.github/workflows/gradle-wrapper-validation.yml @@ -10,5 +10,5 @@ jobs: name: "Validation" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: gradle/wrapper-validation-action@v1 diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 91f9a8ee3df40..432c789e943b5 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -12,7 +12,7 @@ jobs: name: Optional Lint runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: misspell # Check spellings as well uses: reviewdog/action-misspell@v1 with: @@ -34,7 +34,7 @@ jobs: name: Python format runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Python uses: actions/setup-python@v4 with: @@ -100,7 +100,7 @@ jobs: name: Lint JavaScript runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: reviewdog/action-eslint@v1 with: reporter: github-pr-check diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index dceb15b446a8a..7b314d845d9b4 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -14,7 +14,7 @@ jobs: Onnxruntime-TVM: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - uses: actions/setup-python@v4 diff --git a/.github/workflows/publish-c-apidocs.yml b/.github/workflows/publish-c-apidocs.yml index 73e8194bf7a8d..0a3e9ed2594c1 100644 --- 
a/.github/workflows/publish-c-apidocs.yml +++ b/.github/workflows/publish-c-apidocs.yml @@ -24,7 +24,7 @@ jobs: name: Generate C/C++ API docs runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install doxygen and dependencies run: | sudo apt update diff --git a/.github/workflows/publish-csharp-apidocs.yml b/.github/workflows/publish-csharp-apidocs.yml index 097d4a1cdff5e..9b9ca924bd008 100644 --- a/.github/workflows/publish-csharp-apidocs.yml +++ b/.github/workflows/publish-csharp-apidocs.yml @@ -24,7 +24,7 @@ jobs: env: DOCFXVERSION: 2.62.2 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup .NET uses: actions/setup-dotnet@v3 with: diff --git a/.github/workflows/publish-java-apidocs.yml b/.github/workflows/publish-java-apidocs.yml index cea350ba54de0..9ea9bda7e7c53 100644 --- a/.github/workflows/publish-java-apidocs.yml +++ b/.github/workflows/publish-java-apidocs.yml @@ -23,7 +23,7 @@ jobs: name: Generate Java docs runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up JDK 11 uses: actions/setup-java@v3 with: diff --git a/.github/workflows/publish-js-apidocs.yml b/.github/workflows/publish-js-apidocs.yml index 5668be77c98a4..ba8bfd718abfa 100644 --- a/.github/workflows/publish-js-apidocs.yml +++ b/.github/workflows/publish-js-apidocs.yml @@ -23,7 +23,7 @@ jobs: name: Generate JS API docs runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Node.js uses: actions/setup-node@v3 with: diff --git a/.github/workflows/publish-objectivec-apidocs.yml b/.github/workflows/publish-objectivec-apidocs.yml index b966793cc0d06..1b327eebfa8a8 100644 --- a/.github/workflows/publish-objectivec-apidocs.yml +++ b/.github/workflows/publish-objectivec-apidocs.yml @@ -23,7 +23,7 @@ jobs: name: Generate Objective-C API docs runs-on: macos-13 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install Jazzy run: | diff --git a/.github/workflows/publish-python-apidocs.yml b/.github/workflows/publish-python-apidocs.yml index 4ca1249fc1d8e..ab9d4781afb83 100644 --- a/.github/workflows/publish-python-apidocs.yml +++ b/.github/workflows/publish-python-apidocs.yml @@ -24,7 +24,7 @@ jobs: name: Generate Python API docs runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install tools run: | sudo apt-get update diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 8cd62db77b744..ba24e7eebfb03 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -18,7 +18,7 @@ jobs: Windows-CUDA-12: runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: false - uses: actions/setup-python@v4 @@ -46,7 +46,7 @@ jobs: Onnxruntime-TVM: runs-on: windows-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - uses: actions/setup-python@v4 From 7edff1c2bfca812ce352b8df8efc6c904c4292ca Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Wed, 13 Sep 2023 13:02:58 -0700 Subject: [PATCH 17/34] [DML EP] Add subgraph fusion support (#17504) ### Description ### Motivation and Context --- .../src/DmlGraphFusionTransformer.cpp | 37 +++++++++++++- .../src/DmlGraphFusionTransformer.h | 49 +++++++++++-------- .../src/GraphPartitioner.cpp | 18 +++---- .../src/GraphPartitioner.h | 5 +- 4 files changed, 76 insertions(+), 33 
deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp index a9d19a022d3e7..4813707cdf50c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp @@ -38,6 +38,16 @@ namespace Dml bool& modified, int graph_level, const onnxruntime::logging::Logger& logger) const + { + return ApplyImplHelper(graph, modified, graph_level, logger, {}); + } + + onnxruntime::common::Status DmlGraphFusionTransformer::ApplyImplHelper( + onnxruntime::Graph& graph, + bool& modified, + int graph_level, + const onnxruntime::logging::Logger& logger, + const std::unordered_map& implicitInputDefs) const { onnxruntime::ProviderType provider_type = onnxruntime::kDmlExecutionProvider; const gsl::not_null registry = m_providerImpl->GetKernelRegistry().get(); @@ -49,6 +59,30 @@ namespace Dml std::vector> compiledPartitionInfos; std::vector additionalSplittingNodes; + onnxruntime::GraphViewer graph_viewer(graph); + const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder(); + + for (auto node_index : node_topology_list) + { + auto* node = graph.GetNode(node_index); + if (!node) + { + continue; // node was removed + } + + std::unordered_map subgraphImplicitInputDefs; + for (const onnxruntime::NodeArg* inputDef : node->ImplicitInputDefs()) + { + subgraphImplicitInputDefs[inputDef->Name()] = inputDef; + } + + for (auto& entry : node->GetAttributeNameToMutableSubgraphMap()) + { + auto& subgraph = *entry.second; + ORT_RETURN_IF_ERROR(ApplyImplHelper(subgraph, modified, graph_level + 1, logger, subgraphImplicitInputDefs)); + } + } + do { // Initializers needed by any graph partition @@ -62,7 +96,8 @@ namespace Dml m_providerImpl->GetSupportedDeviceDataTypeMask(), graphNodePropertyMap, requiredInitializerMap, - additionalSplittingNodes); + additionalSplittingNodes, + implicitInputDefs); // Reset the splitting nodes for the current iteration additionalSplittingNodes.clear(); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h index b546f29f59719..19dab0c89943c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h @@ -2,32 +2,41 @@ // Licensed under the MIT License. 
#pragma once - +#include +#include #include "core/optimizer/graph_transformer.h" #include "core/framework/execution_providers.h" namespace Dml { - class ExecutionProviderImpl; +class ExecutionProviderImpl; + +class DmlGraphFusionTransformer : public onnxruntime::GraphTransformer +{ +public: + DmlGraphFusionTransformer( + const std::string& name, + const onnxruntime::IExecutionProvider* provider + ); + +public: + static inline const char* const DML_GRAPH_FUSION_NODE_NAME_PREFIX = "DmlFusedNode_"; + static inline const char* const DML_GRAPH_FUSION_NODE_DOMAIN = "DmlFusedNodeDomain"; - class DmlGraphFusionTransformer : public onnxruntime::GraphTransformer - { - public: - DmlGraphFusionTransformer( - const std::string& name, - const onnxruntime::IExecutionProvider* provider - ); +private: + onnxruntime::common::Status ApplyImpl(onnxruntime::Graph& graph, + bool& modified, + int graph_level, + const onnxruntime::logging::Logger& logger) const final; - public: - inline const static char* const DML_GRAPH_FUSION_NODE_NAME_PREFIX = "DmlFusedNode_"; - inline const static char* const DML_GRAPH_FUSION_NODE_DOMAIN = "DmlFusedNodeDomain"; + onnxruntime::common::Status ApplyImplHelper( + onnxruntime::Graph& graph, + bool& modified, + int graph_level, + const onnxruntime::logging::Logger& logger, + const std::unordered_map& implicitInputDefs) const; - private: - onnxruntime::common::Status ApplyImpl(onnxruntime::Graph& graph, - bool& modified, - int graph_level, - const onnxruntime::logging::Logger& logger) const final; - private: - const ExecutionProviderImpl* m_providerImpl = nullptr; - }; +private: + const ExecutionProviderImpl* m_providerImpl = nullptr; +}; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.cpp index 2c8d4e4459f7f..18943878ccedc 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.cpp @@ -345,13 +345,8 @@ namespace Dml // Whether any operator in the model contains a subgraph. This is true // if the graph being partitioned is itself within a subgraph, or contains // an operator with a subgraph. - bool ModelUsesSubgraph(const onnxruntime::GraphViewer& graph) + bool ContainsSubgraph(const onnxruntime::GraphViewer& graph) { - if (graph.IsSubgraph()) - { - return true; - } - const std::vector& toplogicalOrder = graph.GetNodesInTopologicalOrder(); for (size_t nodeIndex : toplogicalOrder) @@ -384,7 +379,8 @@ namespace Dml uint32_t supportedDeviceDataTypeMask, // Each bit corresponds to each DML_TENSOR_DATA_TYPE. std::unordered_map& graphNodePropertyMap, std::unordered_set& requiredInitializerMap, - gsl::span additionalSplittingNodes) + gsl::span additionalSplittingNodes, + const std::unordered_map& implicitInputs) { // Nodes are uniquely identified by the name of their first output argument std::vector> partitions; @@ -419,7 +415,7 @@ namespace Dml } // Check whether this graph is a subgraph, or contains any node with a subgraph. - bool modelUsesSubgraph = ModelUsesSubgraph(graph); + bool containsSubgraph = ContainsSubgraph(graph); uint32_t splittingNodeIndex = 0; @@ -454,10 +450,10 @@ namespace Dml // Add a unique partition if graph node usage is not supported. // // Partitioning is disabled in models with subgraphs to work around issues with implicit inputs. - // The partitioning algorithm does not currently consider such inputs. 
Transfering shared initializers + // The partitioning algorithm does not currently consider such inputs. Transferring shared initializers // for partitions could also cause problems. Note, operators with subgraphs are currently not efficient // anyhow due to CPU/GPU copies. - if (modelUsesSubgraph || !isDmlGraphNode) + if (containsSubgraph || !isDmlGraphNode) { partitions.push_back(CreatePartitionAndFinalizeInputs(node, isDmlNode, false, nodeNameToPartitionMap)); continue; @@ -505,7 +501,7 @@ namespace Dml firstNonFinalInputPartition->AddInput(arg->Name()); } - if (graphInputs.find(arg->Name()) != graphInputs.end()) + if (graphInputs.find(arg->Name()) != graphInputs.end() || implicitInputs.find(arg->Name()) != implicitInputs.end()) { firstNonFinalInputPartition->AddInput(arg->Name()); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.h index 990ba00fc4672..37d577f647fb5 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.h @@ -3,6 +3,8 @@ #pragma once +#include +#include #include "core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.h" namespace Dml @@ -48,5 +50,6 @@ namespace Dml uint32_t supportedDeviceDataTypeMask, // Each bit corresponds to each DML_TENSOR_DATA_TYPE. std::unordered_map& graphNodePropertyMap, std::unordered_set& requiredInitializerMap, - gsl::span additionalSplittingNodes); + gsl::span additionalSplittingNodes, + const std::unordered_map& implicitInputs); } // namespace Dml From 03b56f7a73319e95e381a623325207665c6ac037 Mon Sep 17 00:00:00 2001 From: Arthur Islamov Date: Thu, 14 Sep 2023 00:11:17 +0400 Subject: [PATCH 18/34] [js/webgpu] FP16 extension registration (#17493) ### Description First small change to support FP16 --------- Co-authored-by: Yulong Wang <7679871+fs-eire@users.noreply.github.com> --- js/web/lib/wasm/jsep/backend-webgpu.ts | 8 ++++++-- js/web/lib/wasm/jsep/webgpu/ops/common.ts | 9 ++++++--- js/web/lib/wasm/jsep/webgpu/program-manager.ts | 7 +++++-- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index 653957a9a3489..e6e78df2cfb23 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -110,6 +110,7 @@ export class WebGpuBackend { } this.env = env; + const requiredFeatures: GPUFeatureName[] = []; const deviceDescriptor: GPUDeviceDescriptor = { requiredLimits: { maxComputeWorkgroupStorageSize: adapter.limits.maxComputeWorkgroupStorageSize, @@ -121,13 +122,16 @@ export class WebGpuBackend { maxComputeWorkgroupSizeY: adapter.limits.maxComputeWorkgroupSizeY, maxComputeWorkgroupSizeZ: adapter.limits.maxComputeWorkgroupSizeZ, }, + requiredFeatures, }; // WebGPU Spec: Timestamp Queries Inside Passes // https://github.com/gpuweb/gpuweb/blob/main/proposals/timestamp-query-inside-passes.md if (adapter.features.has('timestamp-query-inside-passes')) { this.supportTimestampQuery = true; - // eslint-disable-next-line @typescript-eslint/no-explicit-any - deviceDescriptor.requiredFeatures = ['timestamp-query-inside-passes' as any]; + requiredFeatures.push('timestamp-query-inside-passes' as GPUFeatureName); + } + if (adapter.features.has('shader-f16')) { + requiredFeatures.push('shader-f16'); } this.device = await adapter.requestDevice(deviceDescriptor); diff --git 
a/js/web/lib/wasm/jsep/webgpu/ops/common.ts b/js/web/lib/wasm/jsep/webgpu/ops/common.ts index c96f4858db2ae..f3845e3110905 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/common.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/common.ts @@ -192,11 +192,14 @@ export interface IndicesHelper { } const getWgslMappedType = (type: number, components: 1|2|3|4): string|[string, string] => { + if (components === 3) { + throw new Error('vec3 has same alignment as vec4, use vec4 instead'); + } + // return type is [ storage type, runtime type ] or a single string for both switch (type) { - // TODO: enable after "shader-f16" WSGL extension release - // case DataType.float16: - // return components > 1 ? `vec${components}` : 'f16'; + case DataType.float16: + return components > 1 ? `vec${components}` : 'f16'; case DataType.float: return components > 1 ? `vec${components}` : 'f32'; case DataType.int32: diff --git a/js/web/lib/wasm/jsep/webgpu/program-manager.ts b/js/web/lib/wasm/jsep/webgpu/program-manager.ts index a02d2ebeebf78..cce61be3448cd 100644 --- a/js/web/lib/wasm/jsep/webgpu/program-manager.ts +++ b/js/web/lib/wasm/jsep/webgpu/program-manager.ts @@ -126,10 +126,13 @@ export class ProgramManager { } build(programInfo: ProgramInfo, normalizedDispatchGroupSize: [number, number, number]): Artifact { const device = this.backend.device; - + const extensions: string[] = []; + if (device.features.has('shader-f16')) { + extensions.push('enable f16;'); + } const shaderHelper = createShaderHelper(normalizedDispatchGroupSize); const userCode = programInfo.getShaderSource(shaderHelper); - const code = `${shaderHelper.additionalImplementations}\n${userCode}`; + const code = `${extensions.join('\n')}\n${shaderHelper.additionalImplementations}\n${userCode}`; const shaderModule = device.createShaderModule({code, label: programInfo.name}); LOG_DEBUG('verbose', () => `[WebGPU] shader code: ${code}`); From 32f5658abb5f2fc82c981b623d8e7969eb46e14f Mon Sep 17 00:00:00 2001 From: cao lei Date: Wed, 13 Sep 2023 21:47:43 -0700 Subject: [PATCH 19/34] remove gsl to make status.h independent from gsl (#17402) ### Description Make status.h independent from gsl. ### Motivation and Context In the coming new feature external EP API (see the prototype https://github.com/microsoft/onnxruntime/pull/16718), we need to expose stream in the public header, however, stream is dependent on status.h which is dependent on gsl. We are seeking a way to decouple stream from gsl. From Changming's comment offline, prefast is disabled so all GSL_SUPPRESS are not taking any effect now. He will handle the warnings when enable prefast in the future --- include/onnxruntime/core/common/status.h | 3 --- onnxruntime/core/framework/tuning_context.h | 1 + .../dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h | 1 + .../core/providers/nnapi/nnapi_builtin/builders/helper.h | 1 + winml/lib/Api.Ort/OnnxruntimeEngine.h | 1 + 5 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/onnxruntime/core/common/status.h b/include/onnxruntime/core/common/status.h index d6e1992944feb..8f171daabbb1e 100644 --- a/include/onnxruntime/core/common/status.h +++ b/include/onnxruntime/core/common/status.h @@ -19,7 +19,6 @@ limitations under the License. #ifdef _WIN32 #include #endif -#include "core/common/gsl.h" namespace onnxruntime { namespace common { @@ -121,10 +120,8 @@ class [[nodiscard]] Status { Status(StatusCategory category, int code); - GSL_SUPPRESS(r.11) Status(const Status& other) : state_((other.state_ == nullptr) ? 
nullptr : new State(*other.state_)) {} - GSL_SUPPRESS(r.11) Status& operator=(const Status& other) { if (state_ != other.state_) { if (other.state_ == nullptr) { diff --git a/onnxruntime/core/framework/tuning_context.h b/onnxruntime/core/framework/tuning_context.h index b6569a21e4c91..aae70d85814bc 100644 --- a/onnxruntime/core/framework/tuning_context.h +++ b/onnxruntime/core/framework/tuning_context.h @@ -3,6 +3,7 @@ #pragma once +#include #include #include "core/common/common.h" diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h index e9c63cc72a837..f94270cfadb8b 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h @@ -5,6 +5,7 @@ #include "core/providers/dml/DmlExecutionProvider/inc/MLOperatorAuthor.h" #include "MLOperatorAuthorPrivate.h" +#include "core/common/gsl.h" #ifdef ORT_NO_EXCEPTIONS #define ML_CHECK_BOOL(x) ORT_THROW_HR_IF(E_INVALIDARG, !(x)) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h index 421c55a2c91a8..766034b3decea 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h @@ -8,6 +8,7 @@ #include "core/common/inlined_containers.h" #include "core/graph/basic_types.h" #include "core/providers/nnapi/nnapi_builtin/nnapi_lib/NeuralNetworksTypes.h" +#include "core/common/gsl.h" // This is the minimal Android API Level required by ORT NNAPI EP to run // ORT running on any host system with Android API level less than this will fall back to CPU EP diff --git a/winml/lib/Api.Ort/OnnxruntimeEngine.h b/winml/lib/Api.Ort/OnnxruntimeEngine.h index 5974d46b82c4f..eae7dc37941c7 100644 --- a/winml/lib/Api.Ort/OnnxruntimeEngine.h +++ b/winml/lib/Api.Ort/OnnxruntimeEngine.h @@ -3,6 +3,7 @@ #include "iengine.h" #include "UniqueOrtPtr.h" +#include "core/common/gsl.h" #include #include From ad369a1fadb9939b615c442f2a9f40ef5e7b6ec0 Mon Sep 17 00:00:00 2001 From: Hans Date: Thu, 14 Sep 2023 13:02:27 +0800 Subject: [PATCH 20/34] [js/rn] Support create boolean tensor (#17052) ### Description For some use case need to create boolean tensor. 
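For reference, a minimal sketch of the JS-side usage this change enables, assuming the standard `onnxruntime-common` `Tensor`/`InferenceSession` API re-exported by `onnxruntime-react-native`; the model path `model.onnx` and the input name `mask` below are hypothetical placeholders, not part of this PR:

```ts
// Minimal usage sketch (illustrative only; 'model.onnx' and 'mask' are hypothetical).
import {InferenceSession, Tensor} from 'onnxruntime-react-native';

async function runWithBooleanInput(): Promise<void> {
  const session = await InferenceSession.create('model.onnx');
  // Boolean tensors are backed by a Uint8Array, with 0 = false and 1 = true.
  const mask = new Tensor('bool', new Uint8Array([1, 0, 1]), [3]);
  const results = await session.run({mask});
  console.log(results);
}
```
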
I've tested on [this project](https://github.com/hans00/react-native-transformers-example) ### Motivation and Context Add handle `ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL` And it required #15556 (It seems not include in latest release (v1.15.1)) --- .../reactnative/TensorHelperTest.java | 28 +++++++++++++++++++ .../onnxruntime/reactnative/TensorHelper.java | 6 +++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/js/react_native/android/src/androidTest/java/ai/onnxruntime/reactnative/TensorHelperTest.java b/js/react_native/android/src/androidTest/java/ai/onnxruntime/reactnative/TensorHelperTest.java index 76fd608e4362b..72518488e6682 100644 --- a/js/react_native/android/src/androidTest/java/ai/onnxruntime/reactnative/TensorHelperTest.java +++ b/js/react_native/android/src/androidTest/java/ai/onnxruntime/reactnative/TensorHelperTest.java @@ -238,6 +238,34 @@ public void createInputTensor_double() throws Exception { outputTensor.close(); } + @Test + public void createInputTensor_bool() throws Exception { + OnnxTensor outputTensor = OnnxTensor.createTensor(ortEnvironment, new boolean[] {false, true}); + + JavaOnlyMap inputTensorMap = new JavaOnlyMap(); + + JavaOnlyArray dims = new JavaOnlyArray(); + dims.pushInt(2); + inputTensorMap.putArray("dims", dims); + + inputTensorMap.putString("type", TensorHelper.JsTensorTypeBool); + + ByteBuffer dataByteBuffer = ByteBuffer.allocate(2); + dataByteBuffer.put((byte)0); + dataByteBuffer.put((byte)1); + inputTensorMap.putMap("data", blobModule.testCreateData(dataByteBuffer.array())); + + OnnxTensor inputTensor = TensorHelper.createInputTensor(blobModule, inputTensorMap, ortEnvironment); + + Assert.assertEquals(inputTensor.getInfo().onnxType, TensorInfo.OnnxTensorType.ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL); + Assert.assertEquals(outputTensor.getInfo().onnxType, TensorInfo.OnnxTensorType.ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL); + Assert.assertEquals(inputTensor.toString(), outputTensor.toString()); + Assert.assertArrayEquals(inputTensor.getByteBuffer().array(), outputTensor.getByteBuffer().array()); + + inputTensor.close(); + outputTensor.close(); + } + @Test public void createOutputTensor_bool() throws Exception { MockitoSession mockSession = mockitoSession().mockStatic(Arguments.class).startMocking(); diff --git a/js/react_native/android/src/main/java/ai/onnxruntime/reactnative/TensorHelper.java b/js/react_native/android/src/main/java/ai/onnxruntime/reactnative/TensorHelper.java index d9c2e3bac5d9b..63cddace36640 100644 --- a/js/react_native/android/src/main/java/ai/onnxruntime/reactnative/TensorHelper.java +++ b/js/react_native/android/src/main/java/ai/onnxruntime/reactnative/TensorHelper.java @@ -174,7 +174,11 @@ private static OnnxTensor createInputTensor(TensorInfo.OnnxTensorType tensorType tensor = OnnxTensor.createTensor(ortEnvironment, buffer, dims, OnnxJavaType.UINT8); break; } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: { + ByteBuffer buffer = values; + tensor = OnnxTensor.createTensor(ortEnvironment, buffer, dims, OnnxJavaType.BOOL); + break; + } case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16: case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32: From 5af6279440a0db698b0afbfc3de655400562831f Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Thu, 14 Sep 2023 07:36:01 -0700 Subject: [PATCH 21/34] Fix Android build (#17540) ### Description The new cpuinfo library doesn't use clog on Android. 
Newer XNNPack versions have removed the dependency on clog, but the one we use still has it. So I cherry-pick the XNNPack to our patch file. --- .../xnnpack/AddEmscriptenAndIosSupport.patch | 48 +++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/cmake/patches/xnnpack/AddEmscriptenAndIosSupport.patch b/cmake/patches/xnnpack/AddEmscriptenAndIosSupport.patch index 7296f2f30f286..37bdbf9fb53f6 100644 --- a/cmake/patches/xnnpack/AddEmscriptenAndIosSupport.patch +++ b/cmake/patches/xnnpack/AddEmscriptenAndIosSupport.patch @@ -1,8 +1,8 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index d53c48aa1..4c987bd7a 100755 +index d53c48aa1..77c3cf983 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt -@@ -105,7 +105,7 @@ ENDIF() +@@ -105,22 +105,12 @@ ENDIF() IF(NOT CMAKE_SYSTEM_NAME) MESSAGE(FATAL_ERROR "CMAKE_SYSTEM_NAME not defined") @@ -11,7 +11,22 @@ index d53c48aa1..4c987bd7a 100755 MESSAGE(FATAL_ERROR "Unrecognized CMAKE_SYSTEM_NAME = ${CMAKE_SYSTEM_NAME}") ENDIF() -@@ -7108,6 +7108,10 @@ IF(MSVC) + # ---[ Download deps + IF(NOT XNNPACK_USE_SYSTEM_LIBS) +- IF(NOT DEFINED CLOG_SOURCE_DIR) +- MESSAGE(STATUS "Downloading clog to ${CMAKE_BINARY_DIR}/clog-source (define CLOG_SOURCE_DIR to avoid it)") +- CONFIGURE_FILE(cmake/DownloadCLog.cmake "${CMAKE_BINARY_DIR}/clog-download/CMakeLists.txt") +- EXECUTE_PROCESS(COMMAND "${CMAKE_COMMAND}" -G "${CMAKE_GENERATOR}" . +- WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/clog-download") +- EXECUTE_PROCESS(COMMAND "${CMAKE_COMMAND}" --build . +- WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/clog-download") +- SET(CLOG_SOURCE_DIR "${CMAKE_BINARY_DIR}/clog-source" CACHE STRING "clog source directory") +- ENDIF() +- + IF(NOT DEFINED CPUINFO_SOURCE_DIR) + MESSAGE(STATUS "Downloading cpuinfo to ${CMAKE_BINARY_DIR}/cpuinfo-source (define CPUINFO_SOURCE_DIR to avoid it)") + CONFIGURE_FILE(cmake/DownloadCpuinfo.cmake "${CMAKE_BINARY_DIR}/cpuinfo-download/CMakeLists.txt") +@@ -7108,6 +7098,10 @@ IF(MSVC) SET_PROPERTY(SOURCE ${ALL_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS "$<$>: /O2 >") SET_PROPERTY(SOURCE ${HOT_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS "$<$>: /O2 >") SET_PROPERTY(SOURCE ${COLD_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS "$<$>: /O1 >") @@ -22,3 +37,30 @@ index d53c48aa1..4c987bd7a 100755 ELSE() SET_PROPERTY(SOURCE ${ALL_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS "$<$>: -O2 >") SET_PROPERTY(SOURCE ${HOT_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS "$<$>: -O2 >") +@@ -7142,26 +7136,6 @@ IF(LIBM) + TARGET_LINK_LIBRARIES(indirection PRIVATE ${LIBM}) + ENDIF() + +-# ---[ Configure clog +-IF(NOT TARGET clog) +- IF(NOT XNNPACK_USE_SYSTEM_LIBS) +- SET(CLOG_BUILD_TESTS OFF CACHE BOOL "") +- SET(CLOG_RUNTIME_TYPE "${CPUINFO_RUNTIME_TYPE}" CACHE STRING "") +- ADD_SUBDIRECTORY( +- "${CLOG_SOURCE_DIR}/deps/clog" +- "${CMAKE_BINARY_DIR}/clog") +- # We build static version of clog but a dynamic library may indirectly depend on it +- SET_PROPERTY(TARGET clog PROPERTY POSITION_INDEPENDENT_CODE ON) +- ELSE() +- ADD_LIBRARY(clog STATIC IMPORTED) +- FIND_LIBRARY(CLOG_LIBRARY clog) +- IF(NOT CLOG_LIBRARY) +- MESSAGE(FATAL_ERROR "Cannot find clog") +- ENDIF() +- SET_PROPERTY(TARGET clog PROPERTY IMPORTED_LOCATION "${CLOG_LIBRARY}") +- ENDIF() +-ENDIF() +- + # ---[ Configure cpuinfo + IF(NOT TARGET cpuinfo) + IF(NOT XNNPACK_USE_SYSTEM_LIBS) From 7af2f68ef307a66a770a620ad2611e511bb2008f Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 14 Sep 2023 10:05:31 -0700 Subject: [PATCH 22/34] [js/web] 
add a test flag to customize chromium flags (#17545) ### Description add a test flag to customize chromium flags. Usage: npm test -- \ --chromium-flags=<...> --- js/web/karma.conf.js | 55 +++++++------------ js/web/script/test-runner-cli-args.ts | 16 +++++- js/web/script/test-runner-cli.ts | 46 +++++++++------- .../azure-pipelines/templates/win-web-ci.yml | 2 +- 4 files changed, 62 insertions(+), 57 deletions(-) diff --git a/js/web/karma.conf.js b/js/web/karma.conf.js index 63c6f5bb045a7..35f782d1fdca3 100644 --- a/js/web/karma.conf.js +++ b/js/web/karma.conf.js @@ -3,10 +3,22 @@ 'use strict'; -const bundleMode = require('minimist')(process.argv)['bundle-mode'] || 'dev'; // 'dev'|'perf'|undefined; -const karmaPlugins = require('minimist')(process.argv)['karma-plugins'] || undefined; -const timeoutMocha = require('minimist')(process.argv)['timeout-mocha'] || 60000; -const forceLocalHost = !!require('minimist')(process.argv)['force-localhost']; +const args = require('minimist')(process.argv, {}); +const bundleMode = args['bundle-mode'] || 'dev'; // 'dev'|'perf'|undefined; +const karmaPlugins = args['karma-plugins'] || undefined; +const timeoutMocha = args['timeout-mocha'] || 60000; +const forceLocalHost = !!args['force-localhost']; + +// parse chromium flags +let chromiumFlags = args['chromium-flags']; +if (!chromiumFlags) { + chromiumFlags = []; +} else if (typeof chromiumFlags === 'string') { + chromiumFlags = [chromiumFlags]; +} else if (!Array.isArray(chromiumFlags)) { + throw new Error(`Invalid command line arg: --chromium-flags: ${chromiumFlags}`); +} + const commonFile = bundleMode === 'dev' ? '../common/dist/ort-common.js' : '../common/dist/ort-common.min.js' const mainFile = bundleMode === 'dev' ? 'test/ort.dev.js' : 'test/ort.perf.js'; @@ -91,37 +103,10 @@ module.exports = function(config) { listenAddress, customLaunchers: { // the following flags are used to make sure Edge on CI agents to initialize WebGPU correctly. 
- EdgeWebGpuTest: {base: 'Edge', flags: ['--ignore-gpu-blocklist', '--gpu-vendor-id=0x10de']}, - ChromeTest: {base: 'Chrome', flags: ['--enable-features=SharedArrayBuffer']}, - ChromeTestHeadless: {base: 'ChromeHeadless', flags: ['--enable-features=SharedArrayBuffer']}, - ChromeDebug: - {debug: true, base: 'Chrome', flags: ['--remote-debugging-port=9333', '--enable-features=SharedArrayBuffer']}, - ChromeCanaryTest: { - base: 'ChromeCanary', - flags: ['--enable-features=SharedArrayBuffer', '--enable-experimental-web-platform-features'] - }, - ChromeCanaryDebug: { - debug: true, - base: 'ChromeCanary', - flags: [ - '--remote-debugging-port=9333', '--enable-features=SharedArrayBuffer', - '--enable-experimental-web-platform-features' - ] - }, - ChromeWebGpuProfileTest: { - base: 'Chrome', - flags: - ['--window-size=1,1', '--enable-features=SharedArrayBuffer', '--disable-dawn-features=disallow_unsafe_apis'] - }, - ChromeWebGpuProfileDebug: { - debug: true, - base: 'Chrome', - flags: [ - '--remote-debugging-port=9333', - '--enable-features=SharedArrayBuffer', - '--disable-dawn-features=disallow_unsafe_apis', - ] - }, + EdgeTest: {base: 'Edge', flags: chromiumFlags}, + ChromeTest: {base: 'Chrome', flags: chromiumFlags}, + ChromeTestHeadless: {base: 'ChromeHeadless', flags: chromiumFlags}, + ChromeCanaryTest: {base: 'ChromeCanary', flags: chromiumFlags}, // // ==== BrowserStack browsers ==== // diff --git a/js/web/script/test-runner-cli-args.ts b/js/web/script/test-runner-cli-args.ts index e34529fa1037d..7b41850948149 100644 --- a/js/web/script/test-runner-cli-args.ts +++ b/js/web/script/test-runner-cli-args.ts @@ -80,6 +80,7 @@ Options: --no-sandbox This flag will be passed to Chrome. Sometimes Chrome need this flag to work together with Karma. + --chromium-flags=<...> This flag will be passed to Chrome and Edge browsers. Can be used multiple times. Examples: @@ -173,6 +174,7 @@ export interface TestRunnerCliArgs { webglOptions?: InferenceSession.WebGLExecutionProviderOption; globalEnvFlags?: Test.Options['globalEnvFlags']; noSandbox?: boolean; + chromiumFlags: string[]; } @@ -439,6 +441,17 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs // Option: --no-sandbox const noSandbox = !!args['no-sandbox']; + // parse chromium flags + let chromiumFlags = args['chromium-flags']; + if (!chromiumFlags) { + chromiumFlags = []; + } else if (typeof chromiumFlags === 'string') { + chromiumFlags = [chromiumFlags]; + } else if (!Array.isArray(chromiumFlags)) { + throw new Error(`Invalid command line arg: --chromium-flags: ${chromiumFlags}`); + } + + npmlog.verbose('TestRunnerCli.Init', ` Mode: ${mode}`); npmlog.verbose('TestRunnerCli.Init', ` Env: ${env}`); npmlog.verbose('TestRunnerCli.Init', ` Debug: ${debug}`); @@ -462,6 +475,7 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs webglOptions, wasmOptions, globalEnvFlags, - noSandbox + noSandbox, + chromiumFlags }; } diff --git a/js/web/script/test-runner-cli.ts b/js/web/script/test-runner-cli.ts index fa4312ee0aaf3..520ef62b2c719 100644 --- a/js/web/script/test-runner-cli.ts +++ b/js/web/script/test-runner-cli.ts @@ -475,10 +475,12 @@ async function main() { args.bundleMode === 'perf' ? 'perf' : args.debug ? 
'debug' : 'test', - webgpu, webnn, config.options.globalEnvFlags?.webgpu?.profilingMode === 'default'); + webgpu, webnn); const karmaArgs = ['karma', 'start', `--browsers ${browser}`]; + const chromiumFlags = ['--enable-features=SharedArrayBuffer', ...args.chromiumFlags]; if (args.debug) { karmaArgs.push('--log-level info --timeout-mocha 9999999'); + chromiumFlags.push('--remote-debugging-port=9333'); } else { karmaArgs.push('--single-run'); } @@ -488,7 +490,22 @@ async function main() { if (webgpu || webnn) { karmaArgs.push('--force-localhost'); } + if (webgpu) { + if (browser.includes('Canary')) { + chromiumFlags.push('--enable-dawn-features=allow_unsafe_apis,use_dxc'); + } else { + chromiumFlags.push('--enable-dawn-features=use_dxc'); + chromiumFlags.push('--disable-dawn-features=disallow_unsafe_apis'); + } + } + if (webnn) { + chromiumFlags.push('--enable-experimental-web-platform-features'); + } + if (config.options.globalEnvFlags?.webgpu?.profilingMode === 'default') { + chromiumFlags.push('--disable-dawn-features=disallow_unsafe_apis'); + } karmaArgs.push(`--bundle-mode=${args.bundleMode}`); + karmaArgs.push(...chromiumFlags.map(flag => `--chromium-flags=${flag}`)); if (browser.startsWith('Edge')) { // There are currently 2 Edge browser launchers: // - karma-edge-launcher: used to launch the old Edge browser @@ -580,12 +597,12 @@ async function main() { } function getBrowserNameFromEnv( - env: TestRunnerCliArgs['env'], mode: 'debug'|'perf'|'test', webgpu: boolean, webnn: boolean, profile: boolean) { + env: TestRunnerCliArgs['env'], mode: 'debug'|'perf'|'test', webgpu: boolean, webnn: boolean) { switch (env) { case 'chrome': - return selectChromeBrowser(mode, webgpu, webnn, profile); + return selectChromeBrowser(mode, webgpu, webnn); case 'edge': - return webgpu ? 'EdgeWebGpuTest' : 'Edge'; + return 'EdgeTest'; case 'firefox': return 'Firefox'; case 'electron': @@ -599,25 +616,14 @@ async function main() { } } - function selectChromeBrowser(mode: 'debug'|'perf'|'test', webgpu: boolean, webnn: boolean, profile: boolean) { - if (webgpu) { - switch (mode) { - case 'debug': - return profile ? 'ChromeWebGpuProfileDebug' : 'ChromeDebug'; - default: - return profile ? 
'ChromeWebGpuProfileTest' : 'ChromeTest'; - } - } else if (webnn) { - switch (mode) { - case 'debug': - return 'ChromeCanaryDebug'; - default: - return 'ChromeCanaryTest'; - } + function selectChromeBrowser(mode: 'debug'|'perf'|'test', webgpu: boolean, webnn: boolean) { + if (webnn) { + return 'ChromeCanaryTest'; + } else if (webgpu) { + return 'ChromeTest'; } else { switch (mode) { case 'debug': - return 'ChromeDebug'; case 'perf': return 'ChromeTest'; default: diff --git a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml index 713396dd64532..bad7448715936 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml @@ -161,7 +161,7 @@ jobs: displayName: 'Run ort-web tests (wasm,webgl,xnnpack backend)' condition: ne('${{ parameters.RunWebGpuTests }}', 'true') - script: | - npm test -- -e=edge -b=webgl,wasm,xnnpack,webgpu + npm test -- -e=edge -b=webgl,wasm,xnnpack,webgpu --chromium-flags=--ignore-gpu-blocklist --chromium-flags=--gpu-vendor-id=0x10de workingDirectory: '$(Build.SourcesDirectory)\js\web' displayName: 'Run ort-web tests (ALL backends)' condition: ne('${{ parameters.RunWebGpuTests }}', 'false') From e11849e716d7d63d0ff2eee8e66b93406c1ef547 Mon Sep 17 00:00:00 2001 From: Rachel Guo <35738743+YUNQIUGUO@users.noreply.github.com> Date: Thu, 14 Sep 2023 10:58:25 -0700 Subject: [PATCH 23/34] Configure StringNormalizer `default_locale` for _APPLE_ system (#17339) ### Description As title. iOS language code uses different syntax for specifying language code/region code: https://developer.apple.com/documentation/xcode/choosing-localization-regions-and-scripts current `default_locale` is not working for iOS. 
### Motivation and Context Issue: https://github.com/microsoft/onnxruntime/issues/17017 --------- Co-authored-by: rachguo Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> --- .../core/providers/cpu/nn/string_normalizer.cc | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/cpu/nn/string_normalizer.cc b/onnxruntime/core/providers/cpu/nn/string_normalizer.cc index 330b92d4a8f78..27407e999945a 100644 --- a/onnxruntime/core/providers/cpu/nn/string_normalizer.cc +++ b/onnxruntime/core/providers/cpu/nn/string_normalizer.cc @@ -201,7 +201,16 @@ class Utf8Converter { #endif -const std::string default_locale("en_US.UTF-8"); // All non-MS +#if defined(__APPLE__) +#include +#if TARGET_OS_IPHONE || TARGET_OS_SIMULATOR +const std::string default_locale("en-US.UTF-8"); +#else +const std::string default_locale("en_US.UTF-8"); // Other kinds of Apple Platforms including MacOS, etc +#endif +#else +const std::string default_locale("en_US.UTF-8"); // All non-MS and not Apple +#endif #endif // _MSC_VER From 198d4688490e8dfd447ba2cc7c99bb30c9b0ef92 Mon Sep 17 00:00:00 2001 From: xhcao Date: Fri, 15 Sep 2023 04:14:11 +0800 Subject: [PATCH 24/34] [WebGPU/JS] Added Pad operator support (#16928) ### Description ### Motivation and Context --- js/web/docs/webgpu-operators.md | 1 + .../lib/wasm/jsep/webgpu/op-resolve-rules.ts | 2 + js/web/lib/wasm/jsep/webgpu/ops/pad.ts | 252 ++++++++++++++++++ js/web/test/suite-test-list.jsonc | 11 +- .../providers/js/js_execution_provider.cc | 12 + .../core/providers/js/operators/pad.cc | 72 +++++ onnxruntime/core/providers/js/operators/pad.h | 34 +++ 7 files changed, 379 insertions(+), 5 deletions(-) create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/pad.ts create mode 100644 onnxruntime/core/providers/js/operators/pad.cc create mode 100644 onnxruntime/core/providers/js/operators/pad.h diff --git a/js/web/docs/webgpu-operators.md b/js/web/docs/webgpu-operators.md index de53f943bc9ef..71d98f5d73671 100644 --- a/js/web/docs/webgpu-operators.md +++ b/js/web/docs/webgpu-operators.md @@ -59,6 +59,7 @@ Do not modify directly.* | Mul | ai.onnx(7-12,13,14+) | | | Neg | ai.onnx(6-12,13+) | | | Not | ai.onnx(1+) | | +| Pad | ai.onnx(2-10,11-12,13-17,18,19+) | | | Pow | ai.onnx(7-11,12,13-14,15+) | | | Reciprocal | ai.onnx(6-12,13+) | | | ReduceL1 | ai.onnx(1-10,11-12,13-17,18+) | | diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index 9c46b97694903..e92e6696d9a78 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -14,6 +14,7 @@ import {gemm, parseGemmAttributes} from './ops/gemm'; import {instanceNorm, parseInstanceNormAttributes} from './ops/instance-norm'; import {layerNorm, parseLayerNormAttributes} from './ops/layer-norm'; import {matMul} from './ops/matmul'; +import {pad, parsePadAttributes} from './ops/pad'; import * as pool from './ops/pool'; import {parseReduceAttributes, reduceL1, reduceL2, reduceLogSum, reduceLogSumExp, reduceMax, reduceMean, reduceMin, reduceProd, reduceSum, reduceSumSquare} from './ops/reduce'; import {parseResizeAttributes, resize} from './ops/resize'; @@ -80,6 +81,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['Mul', [binaryOps.mul]], ['Neg', [unaryOps.neg]], ['Not', [unaryOps.not]], + ['Pad', [pad, parsePadAttributes]], ['Pow', [binaryOps.pow]], ['Reciprocal', [unaryOps.reciprocal]], ['ReduceMin', [reduceMin, parseReduceAttributes]], diff --git 
a/js/web/lib/wasm/jsep/webgpu/ops/pad.ts b/js/web/lib/wasm/jsep/webgpu/ops/pad.ts new file mode 100644 index 0000000000000..d90296b5c5a46 --- /dev/null +++ b/js/web/lib/wasm/jsep/webgpu/ops/pad.ts @@ -0,0 +1,252 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import {DataType} from '../../../wasm-common'; +import {TensorView} from '../../tensor'; +import {ShapeUtil} from '../../util'; +import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; +import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; + +import {IndicesHelper, inputVariable, outputVariable, ShaderHelper} from './common'; + +export interface PadAttributes extends AttributeWithCacheKey { + // 0-constant, 1-reflect, 2-edge, 3-wrap + readonly mode: number; + readonly value: number; + readonly pads: number[]; +} + +const validateInputs = (inputs: readonly TensorView[]): void => { + if (!inputs || inputs.length < 1) { + throw new Error('Too few inputs'); + } + if (inputs[0].dataType !== DataType.float) { + throw new Error('Input type must be float.'); + } + + if (inputs.length >= 2) { + let validPads = inputs[0].dims.length * 2 === inputs[1].dims[0]; + if (inputs.length === 4) { + validPads = inputs[3].dims[0] * 2 === inputs[1].dims[0]; + } + if (!validPads) { + throw new Error('The pads should be a 1D tensor of shape [2 * input_rank] or [2 * num_axes].'); + } + } +}; + +const getPadConstant = + (output: IndicesHelper, outputDims: readonly number[], inputDims: readonly number[], + inputStrides: readonly number[], pads: number[], dataType: string, constantValue: number): string => { + const inputRank = inputDims.length; + + let block = ''; + for (let i = inputRank - 1; i >= 0; --i) { + block += ` + k = i32(${output.indicesGet('indices', i)}) - ${pads[i]}; + if (k < 0) { + break; + } + if (k >= ${inputDims[i]}) { + break; + } + offset += k * ${inputStrides[i]}; + `; + } + + return ` + value = ${dataType}(${constantValue}); + for (var i = 0; i < 1; i++) { + var offset = 0; + var k = 0; + ${block} + value = x[offset]; + } + `; + }; + +const getPadReflect = + (output: IndicesHelper, outputDims: readonly number[], inputDims: readonly number[], + inputStrides: readonly number[], pads: number[]): string => { + const inputRank = inputDims.length; + + let block = ''; + for (let i = inputRank - 1; i >= 0; --i) { + block += ` + k = i32(${output.indicesGet('indices', i)}) - ${pads[i]}; + if (k < 0) { + k = -k; + } + { + let _2n_1 = ${2 * (inputDims[i] - 1)}; + k = k % _2n_1; + if(k >= ${inputDims[i]}) { + k = _2n_1 - k; + } + } + offset += k * ${inputStrides[i]}; + `; + } + + return ` + var offset = 0; + var k = 0; + ${block} + value = x[offset]; + `; + }; + +const getPadEdge = + (output: IndicesHelper, outputDims: readonly number[], inputDims: readonly number[], + inputStrides: readonly number[], pads: number[]): string => { + const inputRank = inputDims.length; + + let block = ''; + for (let i = inputRank - 1; i >= 0; --i) { + block += ` + k = i32(${output.indicesGet('indices', i)}) - ${pads[i]}; + if (k < 0) { + k = 0; + } + if (k >= ${inputDims[i]}) { + k = ${inputDims[i] - 1}; + } + offset += k * ${inputStrides[i]}; + `; + } + + return ` + var offset = 0; + var k = 0; + ${block} + value = x[offset]; + `; + }; + +const getPadWrap = + (output: IndicesHelper, outputDims: readonly number[], inputDims: readonly number[], + inputStrides: readonly number[], pads: number[]): string => { + const inputRank = 
inputDims.length; + + let block = ''; + for (let i = inputRank - 1; i >= 0; --i) { + block += ` + k = i32(${output.indicesGet('indices', i)}) - ${pads[i]}; + if (k < 0) { + k += ${inputDims[i]}; + } + if (k >= ${inputDims[i]}) { + k -= ${inputDims[i]}; + } + offset += k * ${inputStrides[i]}; + `; + } + + return ` + var offset = 0; + var k = 0; + ${block} + value = x[offset]; + `; + }; + +const getPadSnippet = + (output: IndicesHelper, outputDims: readonly number[], inputDims: readonly number[], + inputStrides: readonly number[], attributes: PadAttributes, dataType: string): string => { + switch (attributes.mode) { + case 0: + return getPadConstant( + output, outputDims, inputDims, inputStrides, attributes.pads, dataType, attributes.value); + case 1: + return getPadReflect(output, outputDims, inputDims, inputStrides, attributes.pads); + case 2: + return getPadEdge(output, outputDims, inputDims, inputStrides, attributes.pads); + case 3: + return getPadWrap(output, outputDims, inputDims, inputStrides, attributes.pads); + default: + throw new Error('Invalid mode'); + } + }; + +const generatePadCode = + (shaderHelper: ShaderHelper, inputs: readonly TensorView[], attributes: PadAttributes, dataType: string): + string => { + const inputDims = inputs[0].dims; + const outputDims = ShapeUtil.padShape(inputDims.slice(), attributes.pads); + const outputSize = ShapeUtil.size(outputDims); + const inputStrides = ShapeUtil.computeStrides(inputDims); + + const output = outputVariable('output', inputs[0].dataType, outputDims); + const input = inputVariable('x', inputs[0].dataType, inputDims); + + const padSnippet = getPadSnippet(output, outputDims, inputDims, inputStrides, attributes, dataType); + const padCode = ` + ${shaderHelper.declareVariables(input, output)} + ${output.impl()} + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)} + + let indices = ${output.offsetToIndices('global_idx')}; + + var value = ${dataType}(0); + ${padSnippet} + output[global_idx] = value; + }`; + return padCode; + }; + +const createPadProgramInfo = + (inputs: readonly TensorView[], metadata: ProgramMetadata, attributes: PadAttributes): ProgramInfo => { + const outputShape = ShapeUtil.padShape(inputs[0].dims.slice(), attributes.pads); + return { + ...metadata, + outputs: [{dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}], + getShaderSource: shaderHelper => generatePadCode(shaderHelper, inputs, attributes, 'f32'), + dispatchGroup: () => ({x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)}) + }; + }; + +const createPadAttributesFromInputs = (inputs: readonly TensorView[], attributes: PadAttributes): PadAttributes => { + if (inputs.length > 1) { + const bigInt64Pads = inputs[1].getBigInt64Array(); + const value = (inputs.length >= 3) ? 
inputs[2].getFloat32Array()[0] : 0.0; + + const inputRank = inputs[0].dims.length; + const updatePads = new Int32Array(2 * inputRank).fill(0); + if (inputs.length >= 4) { + const axes = inputs[3].getBigInt64Array(); + for (let i = 0; i < axes.length; i++) { + updatePads[Number(axes[i])] = Number(bigInt64Pads[i]); + updatePads[Number(axes[i]) + inputRank] = Number(bigInt64Pads[i + axes.length]); + } + } else { + bigInt64Pads.forEach((i, v) => updatePads[Number(i)] = (Number(v))); + } + + const pads: number[] = []; + updatePads.forEach(v => pads.push(v)); + + return createAttributeWithCacheKey({mode: attributes.mode, value, pads}); + } else { + return attributes; + } +}; + +const createPadProgramInfoLoader = (inputs: readonly TensorView[], attributes: PadAttributes): ProgramInfoLoader => { + const updatedAttributes = createPadAttributesFromInputs(inputs, attributes); + const metadata: + ProgramMetadata = {name: 'Pad', inputTypes: [GpuDataType.default], cacheHint: updatedAttributes.cacheKey}; + return {...metadata, get: () => createPadProgramInfo(inputs, metadata, updatedAttributes)}; +}; + +export const pad = (context: ComputeContext, attributes: PadAttributes): void => { + validateInputs(context.inputs); + context.compute(createPadProgramInfoLoader(context.inputs, attributes), {inputs: [0]}); +}; + +export const parsePadAttributes = (attributes: Record): PadAttributes => { + const mode = attributes.mode as number; + const value = attributes.value as number; + const pads = attributes.pads as number[]; + return createAttributeWithCacheKey({mode, value, pads}); +}; diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index f4249b24101e5..e580259071968 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -505,7 +505,7 @@ // // "test_dynamicquantizelinear_min_adjusted_expanded", // // "test_dynamicquantizelinear_min_adjusted", // // "test_dynamicquantizelinear", - // // "test_edge_pad", + "test_edge_pad", // "test_einsum_batch_diagonal", // "test_einsum_batch_matmul", // "test_einsum_inner_prod", @@ -965,7 +965,7 @@ "test_reduce_sum_square_keepdims_random", "test_reduce_sum_square_negative_axes_keepdims_example", "test_reduce_sum_square_negative_axes_keepdims_random", - // // "test_reflect_pad", + "test_reflect_pad", "test_relu", // "test_reshape_allowzero_reordered", "test_reshape_extended_dims", @@ -1308,7 +1308,8 @@ "test_unsqueeze_three_axes", "test_unsqueeze_two_axes", "test_unsqueeze_unsorted_axes", - "test_unsqueeze" + "test_unsqueeze", + "test_wrap_pad" // "test_upsample_nearest", // "test_where_example", // "test_where_long_example", @@ -1361,8 +1362,8 @@ "reduce-min.jsonc", "relu.jsonc", "gelu.jsonc", - //"pad.jsonc", - //"pad-big.jsonc", + "pad.jsonc", + "pad-big.jsonc", "pow.jsonc", "pow_int32.jsonc", "pow-big-number.jsonc", diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 6b548921cdc8c..9dccd7c47fbb6 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -321,6 +321,12 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 6 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, float, Einsum); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 2, 10, Pad); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, Pad); +class 
ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, Pad); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, 18, Pad); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 19, Pad); + std::unique_ptr RegisterKernels() { auto kernel_registry = std::make_unique(); @@ -577,6 +583,12 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + }; for (auto& function_table_entry : function_table) { diff --git a/onnxruntime/core/providers/js/operators/pad.cc b/onnxruntime/core/providers/js/operators/pad.cc new file mode 100644 index 0000000000000..24ba85cbf6e0d --- /dev/null +++ b/onnxruntime/core/providers/js/operators/pad.cc @@ -0,0 +1,72 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/js/js_kernel.h" + +#include "pad.h" + +namespace onnxruntime { +namespace js { + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Pad, + kOnnxDomain, + 2, + 10, + kJsExecutionProvider, + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), + Pad); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Pad, + kOnnxDomain, + 11, + 12, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", DataTypeImpl::GetTensorType()) + .InputMemoryType(OrtMemTypeCPU, 1) + .InputMemoryType(OrtMemTypeCPU, 2) + .InputMemoryType(OrtMemTypeCPU, 3), + Pad); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Pad, + kOnnxDomain, + 13, + 17, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", DataTypeImpl::GetTensorType()) + .InputMemoryType(OrtMemTypeCPU, 1) + .InputMemoryType(OrtMemTypeCPU, 2) + .InputMemoryType(OrtMemTypeCPU, 3), + Pad); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Pad, + kOnnxDomain, + 18, + 18, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", DataTypeImpl::GetTensorType()) + .InputMemoryType(OrtMemTypeCPU, 1) + .InputMemoryType(OrtMemTypeCPU, 2) + .InputMemoryType(OrtMemTypeCPU, 3), + Pad); + +ONNX_OPERATOR_KERNEL_EX( + Pad, + kOnnxDomain, + 19, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", DataTypeImpl::GetTensorType()) + .InputMemoryType(OrtMemTypeCPU, 1) + .InputMemoryType(OrtMemTypeCPU, 2) + .InputMemoryType(OrtMemTypeCPU, 3), + Pad); + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/pad.h b/onnxruntime/core/providers/js/operators/pad.h new file mode 100644 index 0000000000000..19168f40b4722 --- /dev/null +++ b/onnxruntime/core/providers/js/operators/pad.h @@ -0,0 +1,34 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/providers/js/js_kernel.h" +#include "core/providers/cpu/tensor/padbase.h" + +namespace onnxruntime { +namespace js { + +class Pad : public JsKernel, public PadBase { + public: + explicit Pad(const OpKernelInfo& info) : JsKernel(info), PadBase(info) { + std::vector pads; + if (!is_dynamic_) { + pads.resize(pads_.size()); + for (size_t i = 0; i < pads_.size(); ++i) { + pads[i] = gsl::narrow_cast(pads_[i]); + } + } + + JSEP_INIT_KERNEL_ATTRIBUTE(Pad, ({"mode" : $1, + "value" : $2, + "pads" : $3 ? Array.from(HEAP32.subarray($4, $4 + $3)) : []}), + static_cast(mode_), + static_cast(value_), + gsl::narrow_cast(pads.size()), + reinterpret_cast((pads.size() > 0) ? 
pads.data() : nullptr) >> 2); + } +}; + +} // namespace js +} // namespace onnxruntime From 46fe08226feb95ce03b6a4773d59b9786f14a3df Mon Sep 17 00:00:00 2001 From: Hector Li Date: Thu, 14 Sep 2023 14:22:45 -0700 Subject: [PATCH 25/34] [QNN EP] Enable Pad op support for QNN EP (#17508) ### Description Enable Pad op support for QNN EP to support more models --- .../selectors_actions/qdq_selectors.cc | 25 ++ .../selectors_actions/qdq_selectors.h | 10 + .../selectors_actions/shared/utils.cc | 11 + .../qnn/builder/op_builder_factory.cc | 4 + .../qnn/builder/op_builder_factory.h | 2 + .../qnn/builder/opbuilder/base_op_builder.h | 4 +- .../builder/opbuilder/gather_op_builder.cc | 1 - .../qnn/builder/opbuilder/pad_op_builder.cc | 247 +++++++++++++ .../builder/opbuilder/simple_op_builder.cc | 4 +- .../providers/qnn/builder/qnn_model_wrapper.h | 1 + .../test/providers/qnn/pad_op_test.cpp | 346 ++++++++++++++++++ 11 files changed, 651 insertions(+), 4 deletions(-) create mode 100644 onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc create mode 100644 onnxruntime/test/providers/qnn/pad_op_test.cpp diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc index 565afcc67e7df..02a7fb733813c 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc @@ -330,6 +330,31 @@ bool WhereNodeGroupSelector::Check(const GraphViewer& graph_viewer, const Node& dt_input_1 == dt_output; } +bool PadNodeGroupSelector::Check(const GraphViewer& graph_viewer, const Node& node, + const std::vector& dq_nodes, + const std::vector& q_nodes) const { + // Pad can have 1 or 2 dq input, the optional input constant_value can be quantized or non-quantized. + // QNN supports data input quantized with constant_value input non-quantized. 
+ int num_dq_inputs = static_cast(dq_nodes.size()); + if (num_dq_inputs > 2) { + return false; + } + + if (!CheckQDQNodes(graph_viewer, node, dq_nodes, q_nodes, num_dq_inputs)) { + return false; + } + + const int32_t dt_input_1 = dq_nodes[0]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); + const int32_t dt_output = q_nodes[0]->OutputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); + if (dq_nodes.size() > 1) { + const int32_t dt_input_2 = dq_nodes[1]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); + return dt_input_1 == dt_input_2 && + dt_input_1 == dt_output; + } else { + return dt_input_1 == dt_output; + } +} + bool InstanceAndLayerNormalizationNodeGroupSelector::Check(const GraphViewer& graph_viewer, const Node& node, const std::vector& dq_nodes, diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h index ab9ad45697dfa..58ebf81508962 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h @@ -110,6 +110,16 @@ class WhereNodeGroupSelector : public NodeGroupSelector { const std::vector& q_nodes) const override; }; +class PadNodeGroupSelector : public NodeGroupSelector { + public: + PadNodeGroupSelector() = default; + + private: + bool Check(const GraphViewer& graph_viewer, const Node& node, + const std::vector& dq_nodes, + const std::vector& q_nodes) const override; +}; + // 2 DQ nodes for input -> node -> optional Q if QLinearMatMul, MatMulIntegerToFloat if not // The lack of a trailing Q isn't really a QDQ node group, so we default support for that to off. class MatMulNodeGroupSelector : public NodeGroupSelector { diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc index 7783d3b3f36b7..f1bdd7a99c329 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc @@ -123,6 +123,9 @@ static const OpVersionsAndSelector::OpVersionsMap GetLogicalComparisonOpVersions static const OpVersionsAndSelector::OpVersionsMap GetWhereOpVersionsMap() { return {{"Where", {}}}; } +static const OpVersionsAndSelector::OpVersionsMap GetPadOpVersionsMap() { + return {{"Pad", {}}}; +} /* Selector rules registration related */ void RegisterMiscSelectors(Selectors& qdq_selectors) { @@ -217,6 +220,13 @@ void RegisterWhereSelectors(Selectors& qdq_selectors) { std::move(selector)); } +void RegisterPadSelectors(Selectors& qdq_selectors) { + /* register selectors for Pad ops */ + std::unique_ptr selector = std::make_unique(); + qdq_selectors.RegisterSelector(GetPadOpVersionsMap(), + std::move(selector)); +} + void SelectorManager::CreateSelectors() { RegisterMiscSelectors(qdq_selectors_); RegisterDropDQSelectors(qdq_selectors_); @@ -231,6 +241,7 @@ void SelectorManager::CreateSelectors() { RegisterBatchNormalizationSelector(qdq_selectors_); RegisterLogicalComparisonSelectors(qdq_selectors_); RegisterWhereSelectors(qdq_selectors_); + RegisterPadSelectors(qdq_selectors_); } void SelectorManager::InitializeSelectorsMap() { diff --git a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc index 58ac3ad45a577..fc8c2efc7a80f 100644 --- 
a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc +++ b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc @@ -154,6 +154,10 @@ OpBuilderRegistrations::OpBuilderRegistrations() { { CreateTransposeOpBuilder("Transpose", *this); } + + { + CreatePadOpBuilder("Pad", *this); + } } const IOpBuilder* GetOpBuilder(const std::string& onnx_op_type) { diff --git a/onnxruntime/core/providers/qnn/builder/op_builder_factory.h b/onnxruntime/core/providers/qnn/builder/op_builder_factory.h index 36cf0e7ff5ac0..5d59f4343d773 100644 --- a/onnxruntime/core/providers/qnn/builder/op_builder_factory.h +++ b/onnxruntime/core/providers/qnn/builder/op_builder_factory.h @@ -88,5 +88,7 @@ void CreateLRNOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_r void CreateTransposeOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); +void CreatePadOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); + } // namespace qnn } // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h index 14d5e45799b81..0431d605bc843 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h @@ -162,7 +162,9 @@ class BaseOpBuilder : public IOpBuilder { {"BatchNormalization", QNN_OP_BATCHNORM}, {"LayerNormalization", QNN_OP_LAYER_NORM}, - {"LRN", QNN_OP_LRN}}; + {"LRN", QNN_OP_LRN}, + + {"Pad", QNN_OP_PAD}}; auto it = onnx_op_type_to_qnn_op_type.find(onnx_op_type); ORT_ENFORCE(it != onnx_op_type_to_qnn_op_type.end()); return it->second; diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc index bd07c099b3cfe..e203667576447 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc @@ -13,7 +13,6 @@ namespace onnxruntime { namespace qnn { -// Operator which only need to hanle node inputs & outputs, no attributes or no need to handle attributes class GatherOpBuilder : public BaseOpBuilder { public: GatherOpBuilder() : BaseOpBuilder("GatherOpBuilder") {} diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc new file mode 100644 index 0000000000000..2dfdfffe5fa54 --- /dev/null +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc @@ -0,0 +1,247 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/providers/common.h" +#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/qnn_model_wrapper.h" +#include "core/providers/qnn/builder/op_builder_factory.h" +#include "core/providers/cpu/tensor/slice_helper.h" +#include "core/providers/qnn/builder/op_builder_factory.h" +#include "core/common/safeint.h" + +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" + +namespace onnxruntime { +namespace qnn { +class PadOpBuilder : public BaseOpBuilder { + public: + PadOpBuilder() : BaseOpBuilder("PadOpBuilder") {} + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(PadOpBuilder); + + protected: + Status ProcessInputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger, + std::vector& input_names, + bool do_op_validation) const override ORT_MUST_USE_RESULT; + + Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + std::vector&& input_names, + const logging::Logger& logger, + bool do_op_validation) const override ORT_MUST_USE_RESULT; +}; + +Status PadOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger, + std::vector& input_names, + bool do_op_validation) const { + const auto& inputs = node_unit.Inputs(); + // QNN Pad only has 1 input, the pads input & constant_value input need to be initializer and set as Qnn node parameter, axes input is not supported. + if (do_op_validation) { + ORT_RETURN_IF(inputs.size() > 3, "QNN Pad doesn't support axes."); + ORT_RETURN_IF(inputs.size() < 2, "QNN Pad requires the pads input."); + + std::vector input_shape; + ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(inputs[0].node_arg, input_shape), "Cannot get shape of input 0."); + ORT_RETURN_IF(input_shape.size() > 5, "QNN Pad doesn't support more than 5 dimension"); + + auto& pads_input_name = inputs[1].node_arg.Name(); + ORT_RETURN_IF_NOT(qnn_model_wrapper.IsInitializerInput(pads_input_name), + "Qnn doesn't support dynamic pad input"); + if (node_unit.Inputs().size() > 2) { + auto& constant_value_input_name = inputs[2].node_arg.Name(); + ORT_RETURN_IF_NOT(qnn_model_wrapper.IsInitializerInput(constant_value_input_name), + "Qnn doesn't support dynamic constant_value input"); + } + } + + ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[0], logger, input_names)); + + return Status::OK(); +} + +template +float DequantizeValue(T value, int32_t offset, float scale) { + return static_cast(static_cast(value) - offset) * scale; +} + +Status ProcessConstantValue(QnnModelWrapper& qnn_model_wrapper, + std::vector& param_tensor_names, + const NodeUnit& node_unit, + const NodeUnitIODef& input) { + OnnxInputInfo input_info = {}; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetOnnxInputInfo(input, input_info)); + std::vector unpacked_tensor; + // Already confirmed constant_value input is initializer in ProcessInputs() + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info.initializer_tensor, unpacked_tensor)); + Qnn_Scalar_t constant_value_qnn_scalar = QNN_SCALAR_INIT; + // constant_value is quantized + if (input.quant_param.has_value()) { + // QNN prefers pad_constant_value quantized with quantization params same as in[0], and data stored as 32-bit signed integer + // Onnx doesn't guarantee it has same quantization parameter as in[0], so get back the float32 value and use non-quantized data directly + constant_value_qnn_scalar.dataType = QNN_DATATYPE_FLOAT_32; + float constant_value = 0; + switch 
(input_info.qnn_data_type) { + case QNN_DATATYPE_SFIXED_POINT_8: { + auto int8_span = ReinterpretAsSpan(gsl::make_span(unpacked_tensor)); + constant_value = DequantizeValue(int8_span.data()[0], + input_info.quant_param.scaleOffsetEncoding.offset, + input_info.quant_param.scaleOffsetEncoding.scale); + break; + } + case QNN_DATATYPE_SFIXED_POINT_16: { + auto int16_span = ReinterpretAsSpan(gsl::make_span(unpacked_tensor)); + constant_value = DequantizeValue(int16_span.data()[0], + input_info.quant_param.scaleOffsetEncoding.offset, + input_info.quant_param.scaleOffsetEncoding.scale); + break; + } + case QNN_DATATYPE_SFIXED_POINT_32: { + auto int32_span = ReinterpretAsSpan(gsl::make_span(unpacked_tensor)); + constant_value = DequantizeValue(int32_span.data()[0], + input_info.quant_param.scaleOffsetEncoding.offset, + input_info.quant_param.scaleOffsetEncoding.scale); + break; + } + case QNN_DATATYPE_UFIXED_POINT_8: { + constant_value = DequantizeValue(unpacked_tensor.data()[0], + input_info.quant_param.scaleOffsetEncoding.offset, + input_info.quant_param.scaleOffsetEncoding.scale); + break; + } + case QNN_DATATYPE_UFIXED_POINT_16: { + auto uint16_span = ReinterpretAsSpan(gsl::make_span(unpacked_tensor)); + constant_value = DequantizeValue(uint16_span.data()[0], + input_info.quant_param.scaleOffsetEncoding.offset, + input_info.quant_param.scaleOffsetEncoding.scale); + break; + } + case QNN_DATATYPE_UFIXED_POINT_32: { + auto uint32_span = ReinterpretAsSpan(gsl::make_span(unpacked_tensor)); + constant_value = DequantizeValue(uint32_span.data()[0], + input_info.quant_param.scaleOffsetEncoding.offset, + input_info.quant_param.scaleOffsetEncoding.scale); + break; + } + default: + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for Pad constant_value."); + } + constant_value_qnn_scalar.floatValue = constant_value; + } else { // constant_value is non-quantized + constant_value_qnn_scalar.dataType = input_info.qnn_data_type; + switch (input_info.qnn_data_type) { + case QNN_DATATYPE_UINT_8: { + constant_value_qnn_scalar.uint8Value = unpacked_tensor.data()[0]; + break; + } + case QNN_DATATYPE_INT_8: { + auto int8_span = ReinterpretAsSpan(gsl::make_span(unpacked_tensor)); + constant_value_qnn_scalar.int8Value = int8_span.data()[0]; + break; + } + case QNN_DATATYPE_INT_16: { + auto int16_span = ReinterpretAsSpan(gsl::make_span(unpacked_tensor)); + constant_value_qnn_scalar.int16Value = int16_span.data()[0]; + break; + } + case QNN_DATATYPE_INT_32: { + auto int32_span = ReinterpretAsSpan(gsl::make_span(unpacked_tensor)); + constant_value_qnn_scalar.int32Value = int32_span.data()[0]; + break; + } + case QNN_DATATYPE_INT_64: { + auto int64_span = ReinterpretAsSpan(gsl::make_span(unpacked_tensor)); + constant_value_qnn_scalar.int64Value = int64_span.data()[0]; + break; + } + case QNN_DATATYPE_FLOAT_32: { + auto float_span = ReinterpretAsSpan(gsl::make_span(unpacked_tensor)); + constant_value_qnn_scalar.floatValue = float_span.data()[0]; + break; + } + default: + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported."); + } // switch + } // if-else + + QnnParamWrapper constant_value_param(node_unit.Index(), + node_unit.Name(), + QNN_OP_PAD_PARAM_PAD_CONSTANT_VALUE, + constant_value_qnn_scalar); + param_tensor_names.push_back(constant_value_param.GetParamTensorName()); + qnn_model_wrapper.AddParamWrapper(std::move(constant_value_param)); + + return Status::OK(); +} + +Status PadOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + 
std::vector&& input_names, + const logging::Logger& logger, + bool do_op_validation) const { + std::vector param_tensor_names; + // Process pads input + // Already confirmed pads input is initializer in ProcessInputs() + const auto& inputs = node_unit.Inputs(); + const auto& pads_input_name = inputs[1].node_arg.Name(); + + std::vector unpacked_tensor; + const auto& input_tensor = qnn_model_wrapper.GetInitializerTensors().at(pads_input_name); + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_tensor, unpacked_tensor)); + // Onnx Pads are int64, Qnn use uint32 + const int64_t* tensor_data = reinterpret_cast(unpacked_tensor.data()); + size_t tensor_byte_size = unpacked_tensor.size(); + size_t size = tensor_byte_size / sizeof(int64_t); + + std::vector pad_amount; + std::transform(tensor_data, tensor_data + size, std::back_inserter(pad_amount), + [](int64_t item) { return SafeInt(item); }); + // Onnx format is begin_0, begin_1, ..., end_0, end_1, ... + // Qnn format is begin_0, end_0, begin_1, end_1, ... + ReArranagePads(pad_amount); + + std::vector pad_amount_dim{static_cast(pad_amount.size() / 2), static_cast(2)}; + QnnParamWrapper multiples_param(node_unit.Index(), node_unit.Name(), QNN_OP_PAD_PARAM_PAD_AMOUNT, std::move(pad_amount_dim), + std::move(pad_amount)); + param_tensor_names.push_back(multiples_param.GetParamTensorName()); + qnn_model_wrapper.AddParamWrapper(std::move(multiples_param)); + + // Process optional input constant_value + if (node_unit.Inputs().size() > 2) { + ORT_RETURN_IF_ERROR(ProcessConstantValue(qnn_model_wrapper, param_tensor_names, node_unit, inputs[2])); + } // constant_value + + NodeAttrHelper node_helper(node_unit); + std::string mode = node_helper.Get("mode", "constant"); + Qnn_Scalar_t mode_qnn_scalar = QNN_SCALAR_INIT; + mode_qnn_scalar.dataType = QNN_DATATYPE_UINT_32; + if ("constant" == mode) { + mode_qnn_scalar.uint32Value = QNN_OP_PAD_SCHEME_CONSTANT; + } else if ("reflect" == mode) { + mode_qnn_scalar.uint32Value = QNN_OP_PAD_SCHEME_MIRROR_REFLECT; + } else if ("edge" == mode) { + mode_qnn_scalar.uint32Value = QNN_OP_PAD_SCHEME_EDGE; + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Pad mode only support constant."); + } + + QnnParamWrapper mode_param(node_unit.Index(), node_unit.Name(), QNN_OP_PAD_PARAM_SCHEME, mode_qnn_scalar); + param_tensor_names.push_back(mode_param.GetParamTensorName()); + qnn_model_wrapper.AddParamWrapper(std::move(mode_param)); + + ORT_RETURN_IF_ERROR(ProcessOutputs(qnn_model_wrapper, node_unit, + std::move(input_names), + std::move(param_tensor_names), + logger, do_op_validation, GetQnnOpType(node_unit.OpType()))); + + return Status::OK(); +} + +void CreatePadOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { + op_registrations.AddOpBuilder(op_type, std::make_unique()); +} + +} // namespace qnn +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc index 8abb847b20b46..556a86bb1519b 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc @@ -118,9 +118,9 @@ Status ProcessModeAttribute(QnnModelWrapper& qnn_model_wrapper, Qnn_Scalar_t mode_qnn_scalar = QNN_SCALAR_INIT; mode_qnn_scalar.dataType = QNN_DATATYPE_UINT_32; if ("DCR" == mode) { - mode_qnn_scalar.uint32Value = 0; + mode_qnn_scalar.uint32Value = QNN_OP_DEPTH_TO_SPACE_MODE_DCR; } else if 
("CRD" == mode) { - mode_qnn_scalar.uint32Value = 1; // CRD mode + mode_qnn_scalar.uint32Value = QNN_OP_DEPTH_TO_SPACE_MODE_CRD; // CRD mode } else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "DepthToSpace mode only support DCR & CRD."); } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h index 1f54bda9107c7..22f8d3a0eaa64 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h @@ -117,6 +117,7 @@ class QnnModelWrapper { return input_index_map_.find(tensor_name) != input_index_map_.end(); } + // TODO(hecli) rename to GetTensorInfo Status GetOnnxInputInfo(const NodeUnitIODef& input, OnnxInputInfo& input_info) const; Status AddReshapeNode(const std::string& input_name, diff --git a/onnxruntime/test/providers/qnn/pad_op_test.cpp b/onnxruntime/test/providers/qnn/pad_op_test.cpp new file mode 100644 index 0000000000000..95961e423833a --- /dev/null +++ b/onnxruntime/test/providers/qnn/pad_op_test.cpp @@ -0,0 +1,346 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#if !defined(ORT_MINIMAL_BUILD) + +#include +#include + +#include "core/graph/node_attr_utils.h" +#include "test/optimizer/qdq_test_utils.h" +#include "test/providers/qnn/qnn_test_utils.h" + +#include "onnx/onnx_pb.h" + +#include "gtest/gtest.h" + +namespace onnxruntime { +namespace test { + +// Returns a function that creates a graph with a single Pad operator. +static GetTestModelFn BuildPadTestCase(const TestInputDef& data_def, + const TestInputDef& pads_def, + const TestInputDef& constant_value_def, + const std::vector& attrs, + bool has_constant_value = true) { + return [data_def, pads_def, constant_value_def, attrs, has_constant_value](ModelTestBuilder& builder) { + NodeArg* data = MakeTestInput(builder, data_def); + NodeArg* pads = MakeTestInput(builder, pads_def); + std::vector inputs{data, pads}; + if (has_constant_value) { + NodeArg* constant_value = MakeTestInput(builder, constant_value_def); + inputs.push_back(constant_value); + } + NodeArg* output = builder.MakeOutput(); + Node& pad_node = builder.AddNode("Pad", inputs, {output}); + + for (const auto& attr : attrs) { + pad_node.AddAttributeProto(attr); + } + }; +} + +// Returns a function that creates a graph with a QDQ Pad operator. 
+template +GetTestQDQModelFn BuildPadQDQTestCase(const TestInputDef& data_def, + const TestInputDef& pads_def, + const TestInputDef& constant_value_def, + const std::vector& attrs, + bool has_constant_value, + bool constant_value_quantized) { + return [data_def, pads_def, constant_value_def, attrs, has_constant_value, constant_value_quantized](ModelTestBuilder& builder, + std::vector>& output_qparams) { + std::vector inputs; + // data -> Q -> DQ -> + NodeArg* data = MakeTestInput(builder, data_def); + QuantParams data_qparams = GetTestInputQuantParams(data_def); + NodeArg* data_qdq = AddQDQNodePair(builder, data, data_qparams.scale, data_qparams.zero_point); + inputs.push_back(data_qdq); + + // pads + NodeArg* pads = MakeTestInput(builder, pads_def); + inputs.push_back(pads); + + // constant_value -- QNN support both quantized and non-quantized + if (has_constant_value) { + if (constant_value_quantized) { + // constant_value -> Q -> DQ -> + NodeArg* constant_value = MakeTestInput(builder, constant_value_def); + QuantParams constant_value_qparams = GetTestInputQuantParams(constant_value_def); + NodeArg* constant_value_qdq = AddQDQNodePair(builder, constant_value, + constant_value_qparams.scale, + constant_value_qparams.zero_point); + inputs.push_back(constant_value_qdq); + } else { + NodeArg* constant_value = MakeTestInput(builder, constant_value_def); + inputs.push_back(constant_value); + } + } + + NodeArg* output = builder.MakeIntermediate(); + Node& pad_node = builder.AddNode("Pad", inputs, {output}); + + for (const auto& attr : attrs) { + pad_node.AddAttributeProto(attr); + } + + // op_output -> Q -> DQ -> output + AddQDQNodePairWithOutputAsGraphOutput(builder, output, output_qparams[0].scale, + output_qparams[0].zero_point); + }; +} + +// Runs an Pad model on the QNN CPU backend. Checks the graph node assignment, and that inference +// outputs for QNN and CPU match. +static void RunPadOpTest(const TestInputDef& data_def, + const TestInputDef& pads_def, + const TestInputDef& constant_value_def, + const std::vector& attrs, + ExpectedEPNodeAssignment expected_ep_assignment, + bool has_constant_value = true, + int opset = 18) { + ProviderOptions provider_options; +#if defined(_WIN32) + provider_options["backend_path"] = "QnnCpu.dll"; +#else + provider_options["backend_path"] = "libQnnCpu.so"; +#endif + + RunQnnModelTest(BuildPadTestCase(data_def, pads_def, constant_value_def, attrs, has_constant_value), + provider_options, + opset, + expected_ep_assignment); +} + +// Runs a QDQ Pad model on the QNN HTP backend. Checks the graph node assignment, and that inference +// outputs for QNN and CPU match. 
+template +static void RunQDQPadOpTest(const TestInputDef& data_def, + const TestInputDef& pads_def, + const TestInputDef& constant_value_def, + const std::vector& attrs, + ExpectedEPNodeAssignment expected_ep_assignment, + bool has_constant_value = true, + bool constant_value_quantized = true, + int opset = 18) { + ProviderOptions provider_options; +#if defined(_WIN32) + provider_options["backend_path"] = "QnnHtp.dll"; +#else + provider_options["backend_path"] = "libQnnHtp.so"; +#endif + + TestQDQModelAccuracy(BuildPadTestCase(data_def, pads_def, constant_value_def, attrs), + BuildPadQDQTestCase(data_def, pads_def, constant_value_def, attrs, + has_constant_value, constant_value_quantized), + provider_options, + opset, + expected_ep_assignment, + 1e-5f); +} + +// +// CPU tests: +// + +// Pad 2d +TEST_F(QnnCPUBackendTests, Pad2d) { + RunPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, true, {0, 2, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "constant")}, + ExpectedEPNodeAssignment::All); +} + +// Pad 2d, pads input not initializer +TEST_F(QnnCPUBackendTests, Pad2dPadsNotIni) { + RunPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, false, {0, 2, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "constant")}, + ExpectedEPNodeAssignment::None); +} + +// Pad reflect mode +// Expected: contains 12 values, where each value and its corresponding value in 16-byte object <0C-00 00-00 00-00 00-00 40-01 23-05 EC-01 00-00> are an almost-equal pair +// Actual: 16-byte object <0C-00 00-00 00-00 00-00 40-01 12-05 EC-01 00-00>, where the value pair (1.2, 0) at index #1 don't match, which is -1.2 from 1.2 +TEST_F(QnnCPUBackendTests, DISABLED_PadModeReflect) { + bool has_constant_value = false; + RunPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, true, {0, 2, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "reflect")}, + ExpectedEPNodeAssignment::All, + has_constant_value); +} + +// Pad edge mode +TEST_F(QnnCPUBackendTests, PadModeEdge) { + bool has_constant_value = false; + RunPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, true, {0, 2, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "edge")}, + ExpectedEPNodeAssignment::All, + has_constant_value); +} + +// Pad wrap mode not supported +TEST_F(QnnCPUBackendTests, PadModeWrap) { + bool has_constant_value = false; + RunPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, true, {0, 2, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "wrap")}, + ExpectedEPNodeAssignment::None, // not supported + has_constant_value); +} + +// Pad 4d +TEST_F(QnnCPUBackendTests, Pad4d) { + RunPadOpTest(TestInputDef({1, 2, 2, 2}, false, + {1.0f, 1.0f, + 1.0f, 1.0f, + 1.0f, 1.0f, + 1.0f, 1.0f}), + TestInputDef({8}, true, {0, 0, 0, 1, 0, 0, 0, 1}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "constant")}, + ExpectedEPNodeAssignment::All); +} + +// Pad 5d supported +TEST_F(QnnCPUBackendTests, Pad5d) { + RunPadOpTest(TestInputDef({1, 2, 2, 2, 2}, false, GetFloatDataInRange(1.0f, 10.0f, 16)), + TestInputDef({10}, true, {0, 0, 0, 1, 0, 0, 0, 1, 0, 0}), + TestInputDef({1}, true, {5.0f}), + {utils::MakeAttribute("mode", "constant")}, + ExpectedEPNodeAssignment::All); +} + +// Pad 6d supported 
+TEST_F(QnnCPUBackendTests, Pad6d) { + RunPadOpTest(TestInputDef({1, 2, 2, 2, 2, 2}, false, GetFloatDataInRange(1.0f, 10.0f, 32)), + TestInputDef({12}, true, {0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "constant")}, + ExpectedEPNodeAssignment::None); +} + +#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) +// +// HTP tests: +// +// QDQ Pad +TEST_F(QnnHTPBackendTests, PadNoConstantValue) { + bool has_constant_value_input = false; + RunQDQPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, true, {0, 2, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "constant")}, + ExpectedEPNodeAssignment::All, + has_constant_value_input); +} + +TEST_F(QnnHTPBackendTests, PadHasConstantValueNonQuantized) { + bool has_constant_value_input = true; + bool constant_value_quantized = false; + RunQDQPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, true, {0, 2, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "constant")}, + ExpectedEPNodeAssignment::All, + has_constant_value_input, + constant_value_quantized); +} + +TEST_F(QnnHTPBackendTests, PadHasConstantValueQuantized) { + bool has_constant_value_input = true; + bool constant_value_quantized = true; + RunQDQPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, true, {0, 2, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "constant")}, + ExpectedEPNodeAssignment::All, + has_constant_value_input, + constant_value_quantized); +} + +// QNN graph execute error. Error code: 6031 +TEST_F(QnnHTPBackendTests, DISABLED_PadReflectMode) { + bool has_constant_value_input = false; + RunQDQPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, true, {0, 2, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "reflect")}, + ExpectedEPNodeAssignment::All, + has_constant_value_input); +} + +TEST_F(QnnHTPBackendTests, PadEdgeMode) { + bool has_constant_value_input = false; + RunQDQPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, true, {0, 2, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "edge")}, + ExpectedEPNodeAssignment::All, + has_constant_value_input); +} + +// wrap mode not supported +TEST_F(QnnHTPBackendTests, PadWrapMode) { + bool has_constant_value_input = false; + RunQDQPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, true, {0, 2, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "wrap")}, + ExpectedEPNodeAssignment::None, + has_constant_value_input); +} + +TEST_F(QnnHTPBackendTests, Pad4d) { + RunQDQPadOpTest(TestInputDef({1, 2, 2, 2}, false, + {1.0f, 2.0f, + 3.0f, 4.0f, + 5.0f, 6.0f, + 7.0f, 8.0f}), + TestInputDef({8}, true, {0, 0, 0, 1, 0, 0, 0, 1}), + TestInputDef({1}, true, {5.0f}), + {utils::MakeAttribute("mode", "constant")}, + ExpectedEPNodeAssignment::All); +} + +// Inaccuracy detected for output 'output', element 0. +// Output quant params: scale=0.035294119268655777, zero_point=0. +// Expected val: 9 +// QNN QDQ val: 8.0117654800415039 (err 0.98823451995849609) +// CPU QDQ val: 9 (err 0) +// QNN limitation? pad_constant_value has to be within the range of input[0]. 
+// Here pad_constant_value = 9.0 > max(input[0]) = 8.0 +TEST_F(QnnHTPBackendTests, DISABLED_Pad4dOutOfRangePadConstantValue) { + RunQDQPadOpTest(TestInputDef({1, 2, 2, 2}, false, + {1.0f, 2.0f, + 3.0f, 4.0f, + 5.0f, 6.0f, + 7.0f, 8.0f}), + TestInputDef({8}, true, {0, 0, 0, 1, 0, 0, 0, 1}), + TestInputDef({1}, true, {9.0f}), // pad_constant_value out of input[0] range + {utils::MakeAttribute("mode", "constant")}, + ExpectedEPNodeAssignment::All); +} + +// Pad 5d supported, but Quantize & Dequantize doesn't support 5d +TEST_F(QnnHTPBackendTests, DISABLED_Pad5d) { + RunQDQPadOpTest(TestInputDef({1, 2, 2, 2, 2}, false, GetFloatDataInRange(1.0f, 10.0f, 16)), + TestInputDef({10}, true, {0, 0, 0, 1, 0, 0, 0, 1, 0, 0}), + TestInputDef({1}, true, {2.0f}), + {utils::MakeAttribute("mode", "constant")}, + ExpectedEPNodeAssignment::All); +} + +#endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) + +} // namespace test +} // namespace onnxruntime + +#endif // !defined(ORT_MINIMAL_BUILD) \ No newline at end of file From 41d2ff622c49aa3628c04f6b64ed7f33c8d80f30 Mon Sep 17 00:00:00 2001 From: Jiajia Qin Date: Fri, 15 Sep 2023 08:03:18 +0800 Subject: [PATCH 26/34] [js/webgpu] Optimize InstanceNormalization (#17491) ### Description In previous implementation, there are two loops to iterate H * W elements to calculate the `mean` and `squaredNorm` value in one thread, meanwhile it outputs H * W elements in one thread. That results it's very very slow when H * W is a large value. And usually, H * W does be a large value in a model. For example, in the `candy-8` model, the shapes of [H, W] are [224,224], [112,112], [56,56] for `InstanceNormalization` op. And in my ADL, `[1,224,224,32]` consumes 17 ms. See below: ``` [profiling] kernel "23848328|[InstanceNormalization] 23848328" input[0]: [1,224,224,32] | float32, input[1]: [32] | float32, input[2]: [32] | float32, output[0]: [1,224,224,32] | float32, execution time: 17007914 ns ``` In this PR, it uses workgroup memory to optimize the original algorithm. The advantage is that it can parallelly utilize the 64 (workgroupSize) threads in one workgroup to calculate `mean` and `squaredNorm` value. Meanwhile, it only outputs `H * W / workgroupSize` outputs for one thread, which greatly reduces the overhead for one thread. With this optimization, `[1,224,224,32]` becomes 3 ms and the main overhead is the extra two `transpose`. The `createInstanceNormProgramInfo` only needs `0.64` ms. See below: ``` [profiling] kernel "23003600|[InstanceNormalization] 23003600" input[0]: [1,224,224,32] | float32, output[0]: [1,32,224,224] | float32, execution time: 1543792 ns program-manager.ts:115 [profiling] kernel "23003600|[InstanceNormalization] 23003600" input[0]: [1,32,224,224] | float32, input[1]: [32] | float32, input[2]: [32] | float32, output[0]: [1,32,224,224] | float32, execution time: 642652 ns program-manager.ts:115 [profiling] kernel "23003600|[InstanceNormalization] 23003600" input[0]: [1,32,224,224] | float32, output[0]: [1,224,224,32] | float32, execution time: 991608 ns ``` This PR currently only applies the new algorithm to NCHW format. For NHWC format, one way is to transpose the input so that it can use the new algorithm. But the disadvantage is that 2 extra transpose are added. @dakenf also gives another way to optimize NHWC. Details see [here](https://github.com/microsoft/onnxruntime/blob/d45a96616da9843b037210f2d48d6b4e5bdae5c6/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts). I checked @dakenf's method. 
The perf is similar with transpose + optimized NCHW. But on different GPUs, one is a little better than another or vice versa. So I prefer this PR only does the NCHW part. @dakenf can submit his optimization on NHWC. --- js/web/lib/wasm/jsep/webgpu/ops/common.ts | 3 +- .../lib/wasm/jsep/webgpu/ops/instance-norm.ts | 116 ++++++++++-------- js/web/test/data/ops/instance-norm.jsonc | 79 ++++++++++++ js/web/test/suite-test-list.jsonc | 2 + 4 files changed, 147 insertions(+), 53 deletions(-) create mode 100644 js/web/test/data/ops/instance-norm.jsonc diff --git a/js/web/lib/wasm/jsep/webgpu/ops/common.ts b/js/web/lib/wasm/jsep/webgpu/ops/common.ts index f3845e3110905..c054da51a3098 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/common.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/common.ts @@ -592,7 +592,8 @@ class ShaderHelperImpl implements ShaderHelper { const workgroupSizeZ = typeof workgroupSize === 'number' ? 1 : workgroupSize[2]; const is1DimensionDispatch = this.normalizedDispatchGroup[1] === 1 && this.normalizedDispatchGroup[2] === 1; - const paramList = is1DimensionDispatch ? '@builtin(global_invocation_id) global_id : vec3' : + const paramList = is1DimensionDispatch ? `@builtin(global_invocation_id) global_id : vec3, + @builtin(local_invocation_id) local_id : vec3` : `@builtin(local_invocation_index) local_index : u32, @builtin(workgroup_id) workgroup_id : vec3`; const globalIdxDefinition = is1DimensionDispatch ? diff --git a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts index f62c766aa9ed0..449073a133295 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts @@ -1,83 +1,97 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; -import {ShaderHelper, tensorTypeToWsglStorageType} from './common'; +import {inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType} from './common'; export interface InstanceNormAttributes extends AttributeWithCacheKey { epsilon: number; format: 'NHWC'|'NCHW'; } -const validateInputs = (inputs: readonly TensorView[]): void => { - if (!inputs || inputs.length !== 3) { - throw new Error('instanceNorm requires 3 inputs.'); - } - - if (inputs[0].dataType !== DataType.float || inputs[1].dataType !== DataType.float) { - throw new Error('inputs should be float type'); - } -}; - const createInstanceNormProgramInfo = (metadata: ProgramMetadata, inputs: readonly TensorView[], attributes: InstanceNormAttributes): ProgramInfo => { const xShape = inputs[0].dims; - const scale = inputs[1]; - const bias = inputs[2]; const outputShape = xShape; - const outputSize = ShapeUtil.size(outputShape); const axis = 2; const normCount = ShapeUtil.sizeToDimension(xShape, axis); const normSize = ShapeUtil.sizeFromDimension(xShape, axis); const C = xShape[1]; - - const scaleSize = ShapeUtil.size(scale.dims); - const biasSize = bias ? ShapeUtil.size(bias.dims) : 0; - if (scaleSize !== normSize || (bias && biasSize !== normSize)) { - throw new Error(`Size of X.shape()[axis:] == ${normSize}. - Size of scale and bias (if provided) must match this. 
- Got scale size of ${scaleSize} and bias size of ${biasSize}`); - } - - const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); - + const x = inputVariable('x', inputs[0].dataType, [xShape[0], xShape[1], normSize]); + const scale = inputVariable('scale', inputs[1].dataType, inputs[1].dims); + const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims); + const output = outputVariable('output', inputs[0].dataType, [xShape[0], xShape[1], normSize]); + const variables = [x, scale, bias, output]; + const dataType = x.type.value; + const workgroupSize = 64; const getShaderSource = (shaderHelper: ShaderHelper) => ` + const C: u32 = ${C}; const normSize: u32 = ${normSize}; - const normSizeTyped: ${dataType} = ${normSize}; const epsilon: f32 = ${attributes.epsilon}; + var meanShared : ${dataType}; + var squaredNormShared : ${dataType}; + var workgroupShared : array<${dataType}, ${workgroupSize}>; + const workgroupSize = ${workgroupSize}u; + ${shaderHelper.declareVariables(...variables)} + ${shaderHelper.mainStart(workgroupSize)} + let norm = global_idx / workgroupSize; + let batch = norm / C; + let channel = norm % C; + let localIndex = local_id.x; + + // initialize workgroup memory + var initial: ${dataType} = 0; + for (var h = localIndex; h < normSize; h += workgroupSize) { + initial = initial + ${x.get('batch', 'channel', 'h')}; + } + workgroupShared[localIndex] = initial; + workgroupBarrier(); - @group(0) @binding(0) var x : array<${dataType}>; - @group(0) @binding(1) var scale : array<${dataType}>; - @group(0) @binding(2) var bias : array<${dataType}>; - @group(0) @binding(3) var output : array<${dataType}>; - - ${shaderHelper.mainStart()} - let offset = global_idx * normSize; - if (offset + normSize >= ${outputSize}) { return; } - var mean: ${dataType} = 0; + // Calculate the mean of current channel data. + for (var currSize = workgroupSize >> 1; currSize > 0; currSize = currSize >> 1) { + if (localIndex < currSize) { + workgroupShared[localIndex] = workgroupShared[localIndex] + workgroupShared[localIndex + currSize]; + } + workgroupBarrier(); + } + if (localIndex == 0) { + meanShared = workgroupShared[0] / ${dataType}(normSize); + } + workgroupBarrier(); - for (var h: u32 = 0u; h < normSize; h++) { - mean = mean + x[h + offset]; + // reinitialize workgroup memory. + initial = 0; + for (var h = localIndex; h < normSize; h += workgroupSize) { + let deviation = ${x.get('batch', 'channel', 'h')} - meanShared; + initial = initial + deviation * deviation; } - mean = mean / normSizeTyped; + workgroupShared[localIndex] = initial; + workgroupBarrier(); - var squaredNorm: ${dataType} = 0; - for (var h: u32 = 0u; h < normSize; h++) { - let deviation: f32 = x[h + offset] - mean; - squaredNorm = squaredNorm + deviation * deviation; + // Calculate the sum of square of deviation of current channel data. 
+ for (var currSize = workgroupSize >> 1; currSize > 0; currSize = currSize >> 1) { + if (localIndex < currSize) { + workgroupShared[localIndex] = workgroupShared[localIndex] + workgroupShared[localIndex + currSize]; + } + workgroupBarrier(); } - let invStdDev = 1 / sqrt(squaredNorm / normSizeTyped + epsilon); - let channelScale = invStdDev * scale[global_idx % C]; - let channelShift = bias[global_idx % C] - mean * channelScale; - for (var j: u32 = 0; j < normSize; j++) { - output[j + offset] = x[j + offset] * channelScale + channelShift; + if (localIndex == 0) { + squaredNormShared = workgroupShared[0]; + } + workgroupBarrier(); + + let invStdDev = 1 / sqrt(squaredNormShared / ${dataType}(normSize) + epsilon); + let channelScale = invStdDev * ${scale.getByOffset('channel')}; + let channelShift = ${bias.getByOffset('channel')} - meanShared * channelScale; + for (var h = localIndex; h < normSize; h += workgroupSize) { + let value = ${x.get('batch', 'channel', 'h')} * channelScale + channelShift; + ${output.set('batch', 'channel', 'h', 'value')}; } }`; return { @@ -86,7 +100,7 @@ const createInstanceNormProgramInfo = {dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}, ], getShaderSource, - dispatchGroup: () => ({x: Math.ceil(normCount / 64 /* workgroup size */)}) + dispatchGroup: () => ({x: normCount}) }; }; @@ -118,7 +132,7 @@ const createInstanceNormNHWCProgramInfo = ${shaderHelper.mainStart()} let currentImageNumber = global_idx / C; let currentChannelNumber = global_idx % C; - + // offset is channel num * N let offset = currentImageNumber * imageSize; if (offset >= ${outputSize}) { return; } @@ -156,8 +170,6 @@ export const parseInstanceNormAttributes = (attributes: InstanceNormAttributes): createAttributeWithCacheKey({epsilon: attributes.epsilon, format: attributes.format}); export const instanceNorm = (context: ComputeContext, attributes: InstanceNormAttributes): void => { - validateInputs(context.inputs); - const metadata = { name: 'InstanceNormalization', inputTypes: [GpuDataType.default, GpuDataType.default, GpuDataType.default], diff --git a/js/web/test/data/ops/instance-norm.jsonc b/js/web/test/data/ops/instance-norm.jsonc new file mode 100644 index 0000000000000..6a4e6912405ee --- /dev/null +++ b/js/web/test/data/ops/instance-norm.jsonc @@ -0,0 +1,79 @@ +[ + { + "name": "Simple test with NHWC", + "operator": "InstanceNormalization", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4], + "dims": [1, 4, 2, 2], + "type": "float32" + }, + { + "data": [1, 2, 3, 4], + "dims": [4], + "type": "float32" + }, + { + "data": [4, 5, 6, 7], + "dims": [4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 2.6583645343780518, 3.552788257598877, 4.447211742401123, 5.341635704040527, 2.3167295455932617, + 4.105576515197754, 5.8944244384765625, 7.683271408081055, 6, 10.242595672607422, 6, 1.7574005126953125, + 12.36654281616211, 8.788846969604492, 5.211153030395508, 1.633458137512207 + ], + "dims": [1, 4, 2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "Simple test with NCHW", + "operator": "InstanceNormalization", + "opset": { "domain": "", "version": 17 }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4], + "dims": [1, 4, 2, 2], + "type": "float32" + }, + { + "data": [1, 2, 3, 4], + "dims": [4], + "type": "float32" + }, 
+ { + "data": [4, 5, 6, 7], + "dims": [4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 2.6583645343780518, 3.552788257598877, 4.447211742401123, 5.341635704040527, 2.3167295455932617, + 4.105576515197754, 5.8944244384765625, 7.683271408081055, 6, 10.242595672607422, 6, 1.7574005126953125, + 12.36654281616211, 8.788846969604492, 5.211153030395508, 1.633458137512207 + ], + "dims": [1, 4, 2, 2], + "type": "float32" + } + ] + } + ] + } +] diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index e580259071968..94592884ccad6 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -257,6 +257,7 @@ "greater.jsonc", //"identity.jsonc", "image-scaler.jsonc", + "instance-norm.jsonc", "less.jsonc", "log.jsonc", "matmul.jsonc", @@ -1347,6 +1348,7 @@ "gemm.jsonc", "global-average-pool.jsonc", "greater.jsonc", + "instance-norm.jsonc", "less.jsonc", "log.jsonc", "matmul.jsonc", From 5ed5f13920b66098cf3bfe7da85f463e6ff64bac Mon Sep 17 00:00:00 2001 From: Kaz Nishimura Date: Fri, 15 Sep 2023 09:47:45 +0900 Subject: [PATCH 27/34] [DML EP] Add missing member initializer DmlGraphNodeCreateInfo::nodeCount (#17505) ### Description This adds a missing member initialization. ### Motivation and Context It caused an access violation in `Dml::GraphDescBuilder::BuildGraphDesc`. --- .../dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h index 232a022d869f4..04381b6ce355c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h @@ -80,7 +80,7 @@ namespace Windows::AI::MachineLearning::Adapter // Either nodesAsOperatorDesc or nodesAsIDMLOperator can have non-zero size. struct DmlGraphNodeCreateInfo { - uint32_t nodeCount; + uint32_t nodeCount = 0; std::vector> nodesAsOperatorDesc; std::vector> nodesAsIDMLOperator; std::vector inputEdges; From 3a1e48dd5ad2a9ef95b6bc137441a4df6b05e5d0 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Thu, 14 Sep 2023 18:15:29 -0700 Subject: [PATCH 28/34] update BERT notebook with ORT 1.16 (#17524) - Update BERT notebook with onnxruntime-gpu 1.16 - Add example of packing mode - Run results in RTX 4090 GPU --- .../PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb | 1433 +++++++++++------ 1 file changed, 956 insertions(+), 477 deletions(-) diff --git a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb index 74b81fc7c867f..43c31e1ea45ac 100644 --- a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb +++ b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb @@ -33,19 +33,20 @@ "\n", "#### GPU Environment Setup using AnaConda\n", "\n", - "First, we install [AnaConda](https://www.anaconda.com/distribution/) in a target machine and open an AnaConda prompt window when it is done. Then run the following commands to create a conda environment. This notebook is tested with PyTorch 1.5.0 and OnnxRuntime 1.3.0.\n", + "First, we install [AnaConda](https://www.anaconda.com/distribution/) in a target machine and open an AnaConda prompt window when it is done. 
Then run the following commands to create a conda environment. This notebook is tested with PyTorch 2.0.1 and OnnxRuntime 1.16.0.\n", "\n", "```console\n", - "conda create -n gpu_env python=3.6\n", + "conda create -n gpu_env python=3.10\n", "conda activate gpu_env\n", - "conda install -c anaconda ipykernel\n", + "pip install jupyterlab\n", + "conda install ipykernel\n", "conda install -c conda-forge ipywidgets\n", - "python -m ipykernel install --user --name=gpu_env\n", - "jupyter notebook\n", + "ipython kernel install --user --name gpu_env\n", + "jupyter-lab\n", "```\n", "Finally, launch Jupyter Notebook and you can choose gpu_env as kernel to run this notebook.\n", "\n", - "Onnxruntime-gpu need specified version of CUDA and cuDNN. You can find the Requirements [here](https://onnxruntime.ai/docs/install/). Remember to add the directories to PATH environment variable (See [CUDA and cuDNN Path](#CUDA-and-cuDNN-Path) below)." + "Onnxruntime-gpu need specified version of CUDA and cuDNN. You can find the Requirements [here](https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements). Remember to add the directories to PATH environment variable (See [CUDA and cuDNN Path](#CUDA-and-cuDNN-Path) below)." ] }, { @@ -56,18 +57,19 @@ "source": [ "import sys\n", "\n", - "run_install = False # Only need install once\n", - "if run_install:\n", - " if sys.platform in ['linux', 'win32']: # Linux or Windows\n", - " !{sys.executable} -m pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio===0.9.0 -f https://download.pytorch.org/whl/torch_stable.html\n", - " else: # Mac\n", - " print(\"PyTorch 1.9 MacOS Binaries do not support CUDA, install from source instead\")\n", - "\n", - " !{sys.executable} -m pip install onnxruntime-gpu==1.8.1 onnx==1.9.0 onnxconverter_common==1.8.1\n", - "\n", - " # Install other packages used in this notebook.\n", - " !{sys.executable} -m pip install transformers==4.8.2\n", - " !{sys.executable} -m pip install psutil pytz pandas py-cpuinfo py3nvml coloredlogs wget netron sympy" + "if sys.platform in ['linux', 'win32']: # Linux or Windows\n", + " !{sys.executable} -m pip install torch --index-url https://download.pytorch.org/whl/cu118 -q\n", + " !{sys.executable} -m pip install onnxruntime-gpu onnx transformers psutil pandas py-cpuinfo py3nvml coloredlogs wget netron sympy protobuf==3.20.3 -q\n", + "else: # Mac\n", + " print(\"CUDA is not available on MacOS\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### CUDA and cuDNN Path\n", + "onnxruntime-gpu has dependency on [CUDA](https://developer.nvidia.com/cuda-downloads) and [cuDNN](https://developer.nvidia.com/cudnn). Required CUDA version can be found [here](https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements) If you import torch before onnxruntime, onnxruntime might use the CUDA and cuDNN DLLs that loaded by PyTorch." 
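As a quick sanity check of the GPU setup described above, a short snippet along these lines can be run in a fresh cell; it is a sketch rather than part of the original notebook, it only relies on the public `onnxruntime` API, and the CUDA directory in the comment is an example path, not a required value:

```python
# Sketch of a GPU environment check (assumed cell, not part of the original notebook).
import onnxruntime

# On Windows, the CUDA/cuDNN bin directories must be discoverable, e.g. via PATH or
# os.add_dll_directory(r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin")  # example path, adjust to your install
print(onnxruntime.get_device())               # expected to print "GPU" for the onnxruntime-gpu package
print(onnxruntime.get_available_providers())  # should include "CUDAExecutionProvider"
```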
] }, { @@ -79,10 +81,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "pytorch: 1.9.0+cu111\n", - "onnxruntime: 1.8.1\n", - "onnx: 1.9.0\n", - "transformers: 4.8.2\n" + "pytorch: 2.0.1+cu118\n", + "onnxruntime: 1.16.0\n", + "onnx: 1.14.1\n", + "transformers: 4.33.1\n" ] } ], @@ -191,9 +193,12 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 48/48 [00:03<00:00, 14.24it/s]\n", - "convert squad examples to features: 100%|██████████| 1000/1000 [00:08<00:00, 112.67it/s]\n", - "add example index and unique id: 100%|██████████| 1000/1000 [00:00<00:00, 836518.55it/s]\n" + "Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n", + "- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:02<00:00, 16.27it/s]\n", + "convert squad examples to features: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:03<00:00, 256.11it/s]\n", + "add example index and unique id: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00\n", " \n", " 0\n", - " 23.72\n", - " 23.72\n", - " 23.87\n", - " 23.99\n", - " 24.11\n", - " 24.37\n", - " 42.15\n", - " 4\n", + " 3.19\n", + " 3.16\n", + " 3.21\n", + " 3.27\n", + " 3.35\n", + " 3.52\n", + " 313.24\n", + " 1\n", " \n", " \n", " 1\n", - " 24.24\n", - " 24.24\n", - " 24.42\n", - " 24.60\n", - " 24.76\n", - " 25.23\n", - " 41.25\n", - " 3\n", + " 3.20\n", + " 3.17\n", + " 3.22\n", + " 3.25\n", + " 3.34\n", + " 3.50\n", + " 312.80\n", + " 8\n", " \n", " \n", " 2\n", - " 24.36\n", - " 24.36\n", - " 24.47\n", - " 24.69\n", - " 25.01\n", - " 26.52\n", - " 41.05\n", - " 2\n", + " 3.20\n", + " 3.15\n", + " 3.25\n", + " 3.29\n", + " 3.36\n", + " 3.58\n", + " 312.51\n", + " 15\n", " \n", " \n", " 3\n", - " 24.39\n", - " 24.37\n", - " 24.47\n", - " 24.65\n", - " 24.73\n", - " 25.12\n", - " 41.01\n", - " 1\n", + " 3.20\n", + " 3.18\n", + " 3.21\n", + " 3.26\n", + " 3.35\n", + " 3.53\n", + " 312.49\n", + " 14\n", + " \n", + " \n", + " 4\n", + " 3.20\n", + " 3.16\n", + " 3.25\n", + " 3.29\n", + " 3.40\n", + " 3.56\n", + " 312.24\n", + " 13\n", + " \n", + " \n", + " 5\n", + " 3.20\n", + " 3.19\n", + " 3.22\n", + " 3.27\n", + " 3.35\n", + " 3.48\n", + " 312.20\n", + " 12\n", + " \n", + " \n", + " 6\n", + " 3.21\n", + " 3.18\n", + " 3.23\n", + " 3.28\n", + " 3.37\n", + " 3.51\n", + " 311.73\n", + " 24\n", + " \n", + " \n", + " 7\n", + " 3.21\n", + " 3.19\n", + " 3.23\n", + " 3.27\n", + " 3.34\n", + " 3.52\n", + " 311.57\n", + " 9\n", + " \n", + " \n", + " 8\n", + " 3.21\n", + " 3.18\n", + " 3.26\n", + " 3.31\n", + " 3.36\n", + " 3.54\n", + " 311.15\n", + " 32\n", + " \n", 
+ " \n", + " 9\n", + " 3.21\n", + " 3.17\n", + " 3.24\n", + " 3.28\n", + " 3.34\n", + " 3.52\n", + " 311.10\n", + " 5\n", + " \n", + " \n", + " 10\n", + " 3.21\n", + " 3.19\n", + " 3.25\n", + " 3.29\n", + " 3.33\n", + " 3.54\n", + " 311.10\n", + " 2\n", + " \n", + " \n", + " 11\n", + " 3.22\n", + " 3.19\n", + " 3.25\n", + " 3.29\n", + " 3.36\n", + " 3.51\n", + " 310.93\n", + " 10\n", + " \n", + " \n", + " 12\n", + " 3.22\n", + " 3.19\n", + " 3.26\n", + " 3.29\n", + " 3.40\n", + " 3.55\n", + " 310.30\n", + " 3\n", + " \n", + " \n", + " 13\n", + " 3.23\n", + " 3.19\n", + " 3.26\n", + " 3.32\n", + " 3.42\n", + " 3.58\n", + " 310.02\n", + " 11\n", + " \n", + " \n", + " 14\n", + " 3.23\n", + " 3.19\n", + " 3.26\n", + " 3.30\n", + " 3.36\n", + " 3.54\n", + " 310.02\n", + " 4\n", + " \n", + " \n", + " 15\n", + " 3.23\n", + " 3.20\n", + " 3.23\n", + " 3.27\n", + " 3.35\n", + " 3.60\n", + " 309.53\n", + " 7\n", + " \n", + " \n", + " 16\n", + " 3.23\n", + " 3.19\n", + " 3.22\n", + " 3.26\n", + " 3.33\n", + " 3.68\n", + " 309.27\n", + " 6\n", " \n", " \n", "\n", "" ], "text/plain": [ - " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", - "0 23.72 23.72 23.87 23.99 24.11 \n", - "1 24.24 24.24 24.42 24.60 24.76 \n", - "2 24.36 24.36 24.47 24.69 25.01 \n", - "3 24.39 24.37 24.47 24.65 24.73 \n", + " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", + "0 3.19 3.16 3.21 3.27 3.35 \n", + "1 3.20 3.17 3.22 3.25 3.34 \n", + "2 3.20 3.15 3.25 3.29 3.36 \n", + "3 3.20 3.18 3.21 3.26 3.35 \n", + "4 3.20 3.16 3.25 3.29 3.40 \n", + "5 3.20 3.19 3.22 3.27 3.35 \n", + "6 3.21 3.18 3.23 3.28 3.37 \n", + "7 3.21 3.19 3.23 3.27 3.34 \n", + "8 3.21 3.18 3.26 3.31 3.36 \n", + "9 3.21 3.17 3.24 3.28 3.34 \n", + "10 3.21 3.19 3.25 3.29 3.33 \n", + "11 3.22 3.19 3.25 3.29 3.36 \n", + "12 3.22 3.19 3.26 3.29 3.40 \n", + "13 3.23 3.19 3.26 3.32 3.42 \n", + "14 3.23 3.19 3.26 3.30 3.36 \n", + "15 3.23 3.20 3.23 3.27 3.35 \n", + "16 3.23 3.19 3.22 3.26 3.33 \n", "\n", - " Latency_P99 Throughput(QPS) intra_op_num_threads \n", - "0 24.37 42.15 4 \n", - "1 25.23 41.25 3 \n", - "2 26.52 41.05 2 \n", - "3 25.12 41.01 1 " + " Latency_P99 Throughput(QPS) intra_op_num_threads \n", + "0 3.52 313.24 1 \n", + "1 3.50 312.80 8 \n", + "2 3.58 312.51 15 \n", + "3 3.53 312.49 14 \n", + "4 3.56 312.24 13 \n", + "5 3.48 312.20 12 \n", + "6 3.51 311.73 24 \n", + "7 3.52 311.57 9 \n", + "8 3.54 311.15 32 \n", + "9 3.52 311.10 5 \n", + "10 3.54 311.10 2 \n", + "11 3.51 310.93 10 \n", + "12 3.55 310.30 3 \n", + "13 3.58 310.02 11 \n", + "14 3.54 310.02 4 \n", + "15 3.60 309.53 7 \n", + "16 3.68 309.27 6 " ] }, - "execution_count": 18, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import os\n", - "import glob \n", - "import pandas\n", - "latest_result_file = max(glob.glob(\"./onnx/perf_results_GPU_B1_S128_*.txt\"), key=os.path.getmtime)\n", - "result_data = pandas.read_table(latest_result_file)\n", - "print(\"Float32 model perf results from\", latest_result_file)\n", - "# Remove some columns that have same values for all rows.\n", - "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n", - "result_data.drop(columns_to_remove, axis=1, inplace=True)\n", - "result_data" + "def load_last_perf_test_result():\n", + " import os\n", + " import glob \n", + " import pandas\n", + " latest_result_file = max(glob.glob(\"./onnx/perf_results_*.txt\"), key=os.path.getmtime)\n", + " result_data = 
pandas.read_table(latest_result_file)\n", + " print(\"Perf results from\", latest_result_file)\n", + " # Do not show columns that have same values for all rows.\n", + " columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu', 'use_io_binding', 'average_sequence_length', 'random_sequence_length']\n", + " result_data.drop(columns_to_remove, axis=1, inplace=True)\n", + " return result_data\n", + " \n", + "thread_results = load_last_perf_test_result()\n", + "thread_results" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "From above result, we can see that latency is very close for different settings. The default setting (intra_op_num_threads=0, OMP_NUM_THREADS and OMP_WAIT_POLICY does not exist) performs the best. \n", + "From above result, we can see that latency is very close for different settings of intra_op_num_threads.\n", "\n", "### Model Results Comparison Tool\n", "\n", @@ -891,21 +1063,21 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "100% passed for 100 random inputs given thresholds (rtol=0.01, atol=0.01).\r\n", - "maximum absolute difference=5.316734313964844e-05\r\n", - "maximum relative difference=0.00012461667938623577\r\n" + "100% passed for 100 random inputs given thresholds (rtol=0.01, atol=0.01).\n", + "maximum absolute difference=0.05149984359741211\n" ] } ], "source": [ - "!{sys.executable} -m onnxruntime.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 100 --rtol 0.01 --atol 0.01 $GPU_OPTION" + "USE_GPU = '--use_gpu' if use_gpu else ''\n", + "!{sys.executable} -m onnxruntime.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 100 --rtol 0.01 --atol 0.01 $USE_GPU" ] }, { @@ -916,80 +1088,106 @@ "\n", "The optimizer.py script have an option **--float16** to convert model to use float16 to store weights. After the conversion, it could be faster to run in GPU with tensor cores like V100 or T4.\n", "\n", - "Let's run tools to measure the performance on V100. The results show significant performance improvement: latency is about 3.4 ms for float32 model, and 1.8 ms for float16 model." + "Let's run tools to measure the performance on Nvidia RTX 4090. The results show significant performance improvement: latency is about 3.2 ms for float32 model, and about 1.8 ms for float16 model." 
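The cell that follows runs this conversion through the `optimizer.py` command line. For reference, a roughly equivalent Python-API sketch is shown here; it assumes the `export_model_path` and `optimized_fp16_model_path` variables defined earlier in the notebook, and uses the BERT-base settings of 12 attention heads and hidden size 768:

```python
# Python-API sketch of the same FP16 optimization (an assumed alternative to the CLI cell below).
from onnxruntime.transformers import optimizer

opt_model = optimizer.optimize_model(export_model_path, model_type='bert',
                                     num_heads=12, hidden_size=768, use_gpu=True)
opt_model.convert_float_to_float16()                     # store weights in float16
opt_model.save_model_to_file(optimized_fp16_model_path)  # same output path as the CLI version
```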
] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - " apply: Fused LayerNormalization count: 49\n", - " apply: Fused Gelu count: 24\n", - "adjust_reshape_and_expand: Removed Reshape and Expand count: 0\n", - " apply: Fused SkipLayerNormalization count: 48\n", - " apply: Fused Attention count: 24\n", - " prune_graph: Graph pruned: 0 inputs, 0 outputs and 5 nodes are removed\n", - " apply: Fused EmbedLayerNormalization(with mask) count: 1\n", - " prune_graph: Graph pruned: 0 inputs, 0 outputs and 3 nodes are removed\n", - " prune_graph: Graph pruned: 0 inputs, 0 outputs and 0 nodes are removed\n", - " apply: Fused BiasGelu count: 24\n", - " apply: Fused SkipLayerNormalization(add bias) count: 48\n", + "\u001b\u0000[\u00000\u0000;\u00009\u00003\u0000m\u00002\u00000\u00002\u00003\u0000-\u00000\u00009\u0000-\u00001\u00002\u0000 \u00001\u00002\u0000:\u00005\u00007\u0000:\u00005\u00004\u0000.\u00005\u00005\u00000\u00008\u00002\u00002\u00008\u0000 \u0000[\u0000W\u0000:\u0000o\u0000n\u0000n\u0000x\u0000r\u0000u\u0000n\u0000t\u0000i\u0000m\u0000e\u0000:\u0000,\u0000 \u0000s\u0000e\u0000s\u0000s\u0000i\u0000o\u0000n\u0000_\u0000s\u0000t\u0000a\u0000t\u0000e\u0000.\u0000c\u0000c\u0000:\u00001\u00001\u00006\u00002\u0000 \u0000o\u0000n\u0000n\u0000x\u0000r\u0000u\u0000n\u0000t\u0000i\u0000m\u0000e\u0000:\u0000:\u0000V\u0000e\u0000r\u0000i\u0000f\u0000y\u0000E\u0000a\u0000c\u0000h\u0000N\u0000o\u0000d\u0000e\u0000I\u0000s\u0000A\u0000s\u0000s\u0000i\u0000g\u0000n\u0000e\u0000d\u0000T\u0000o\u0000A\u0000n\u0000E\u0000p\u0000]\u0000 \u0000S\u0000o\u0000m\u0000e\u0000 \u0000n\u0000o\u0000d\u0000e\u0000s\u0000 \u0000w\u0000e\u0000r\u0000e\u0000 \u0000n\u0000o\u0000t\u0000 \u0000a\u0000s\u0000s\u0000i\u0000g\u0000n\u0000e\u0000d\u0000 \u0000t\u0000o\u0000 \u0000t\u0000h\u0000e\u0000 \u0000p\u0000r\u0000e\u0000f\u0000e\u0000r\u0000r\u0000e\u0000d\u0000 \u0000e\u0000x\u0000e\u0000c\u0000u\u0000t\u0000i\u0000o\u0000n\u0000 \u0000p\u0000r\u0000o\u0000v\u0000i\u0000d\u0000e\u0000r\u0000s\u0000 \u0000w\u0000h\u0000i\u0000c\u0000h\u0000 \u0000m\u0000a\u0000y\u0000 \u0000o\u0000r\u0000 \u0000m\u0000a\u0000y\u0000 \u0000n\u0000o\u0000t\u0000 \u0000h\u0000a\u0000v\u0000e\u0000 \u0000a\u0000n\u0000 \u0000n\u0000e\u0000g\u0000a\u0000t\u0000i\u0000v\u0000e\u0000 \u0000i\u0000m\u0000p\u0000a\u0000c\u0000t\u0000 \u0000o\u0000n\u0000 \u0000p\u0000e\u0000r\u0000f\u0000o\u0000r\u0000m\u0000a\u0000n\u0000c\u0000e\u0000.\u0000 \u0000e\u0000.\u0000g\u0000.\u0000 \u0000O\u0000R\u0000T\u0000 \u0000e\u0000x\u0000p\u0000l\u0000i\u0000c\u0000i\u0000t\u0000l\u0000y\u0000 \u0000a\u0000s\u0000s\u0000i\u0000g\u0000n\u0000s\u0000 \u0000s\u0000h\u0000a\u0000p\u0000e\u0000 \u0000r\u0000e\u0000l\u0000a\u0000t\u0000e\u0000d\u0000 \u0000o\u0000p\u0000s\u0000 \u0000t\u0000o\u0000 \u0000C\u0000P\u0000U\u0000 \u0000t\u0000o\u0000 \u0000i\u0000m\u0000p\u0000r\u0000o\u0000v\u0000e\u0000 \u0000p\u0000e\u0000r\u0000f\u0000.\u0000\u001b\u0000[\u0000m\u0000\n", + "\u0000\u001b\u0000[\u00000\u0000;\u00009\u00003\u0000m\u00002\u00000\u00002\u00003\u0000-\u00000\u00009\u0000-\u00001\u00002\u0000 \u00001\u00002\u0000:\u00005\u00007\u0000:\u00005\u00004\u0000.\u00005\u00005\u00001\u00001\u00000\u00000\u00008\u0000 \u0000[\u0000W\u0000:\u0000o\u0000n\u0000n\u0000x\u0000r\u0000u\u0000n\u0000t\u0000i\u0000m\u0000e\u0000:\u0000,\u0000 
\u0000s\u0000e\u0000s\u0000s\u0000i\u0000o\u0000n\u0000_\u0000s\u0000t\u0000a\u0000t\u0000e\u0000.\u0000c\u0000c\u0000:\u00001\u00001\u00006\u00004\u0000 \u0000o\u0000n\u0000n\u0000x\u0000r\u0000u\u0000n\u0000t\u0000i\u0000m\u0000e\u0000:\u0000:\u0000V\u0000e\u0000r\u0000i\u0000f\u0000y\u0000E\u0000a\u0000c\u0000h\u0000N\u0000o\u0000d\u0000e\u0000I\u0000s\u0000A\u0000s\u0000s\u0000i\u0000g\u0000n\u0000e\u0000d\u0000T\u0000o\u0000A\u0000n\u0000E\u0000p\u0000]\u0000 \u0000R\u0000e\u0000r\u0000u\u0000n\u0000n\u0000i\u0000n\u0000g\u0000 \u0000w\u0000i\u0000t\u0000h\u0000 \u0000v\u0000e\u0000r\u0000b\u0000o\u0000s\u0000e\u0000 \u0000o\u0000u\u0000t\u0000p\u0000u\u0000t\u0000 \u0000o\u0000n\u0000 \u0000a\u0000 \u0000n\u0000o\u0000n\u0000-\u0000m\u0000i\u0000n\u0000i\u0000m\u0000a\u0000l\u0000 \u0000b\u0000u\u0000i\u0000l\u0000d\u0000 \u0000w\u0000i\u0000l\u0000l\u0000 \u0000s\u0000h\u0000o\u0000w\u0000 \u0000n\u0000o\u0000d\u0000e\u0000 \u0000a\u0000s\u0000s\u0000i\u0000g\u0000n\u0000m\u0000e\u0000n\u0000t\u0000s\u0000.\u0000\u001b\u0000[\u0000m\u0000\n", + "\u0000 apply: Fused LayerNormalization: 49\n", + " apply: Fused Gelu: 24\n", + " apply: Fused SkipLayerNormalization: 48\n", + " apply: Fused Attention: 24\n", + " prune_graph: Removed 5 nodes\n", + " apply: Fused EmbedLayerNormalization(with mask): 1\n", + " prune_graph: Removed 10 nodes\n", + " apply: Fused BiasGelu: 24\n", + " apply: Fused SkipLayerNormalization(add bias): 48\n", " optimize: opset version: 11\n", + "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 24, 'MultiHeadAttention': 0, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 24, 'GemmFastGelu': 0, 'LayerNormalization': 0, 'SkipLayerNormalization': 48, 'QOrderedAttention': 0, 'QOrderedGelu': 0, 'QOrderedLayerNormalization': 0, 'QOrderedMatMul': 0}\n", + " main: The model has been fully optimized.\n", " save_model_to_file: Sort graphs in topological order\n", - " save_model_to_file: Output model to ./onnx/bert-base-cased-squad_opt_gpu_fp16.onnx\n", - "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 24, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 24, 'LayerNormalization': 0, 'SkipLayerNormalization': 48}\n", - " main: The model has been fully optimized.\n" + " save_model_to_file: Model saved to ./onnx/bert-base-cased-squad_opt_gpu_fp16.onnx\n" ] } ], "source": [ "optimized_fp16_model_path = './onnx/bert-base-cased-squad_opt_{}_fp16.onnx'.format('gpu' if use_gpu else 'cpu')\n", - "!{sys.executable} -m onnxruntime.transformers.optimizer --input $export_model_path --output $optimized_fp16_model_path --float16" + "!{sys.executable} -m onnxruntime.transformers.optimizer --input $export_model_path --output $optimized_fp16_model_path --float16 $USE_GPU" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=None, seed=3, verbose=False)\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=32,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 566.45 QPS\n", + "Running test: 
model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=24,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.74 ms, Throughput = 574.96 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=15,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.74 ms, Throughput = 574.28 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=14,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.74 ms, Throughput = 575.17 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=13,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.76 ms, Throughput = 569.77 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.79 ms, Throughput = 559.84 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=11,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 566.09 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=10,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 563.97 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=9,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 565.70 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 565.50 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=7,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 566.38 QPS\n", + "Running test: 
model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.75 ms, Throughput = 572.89 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=5,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.76 ms, Throughput = 568.67 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=4,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.78 ms, Throughput = 561.98 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 566.14 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=2,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.78 ms, Throughput = 563.25 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 565.09 QPS\n", + "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=None, seed=3, verbose=False, log_severity=2, average_sequence_length=128, random_sequence_length=False)\n", "Generating 1000 samples for batch_size=1 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=4,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 6.78 ms, Throughput = 147.54 QPS\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 6.76 ms, Throughput = 147.85 QPS\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=2,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 6.79 ms, Throughput = 147.30 QPS\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 6.81 ms, Throughput = 146.75 QPS\n", - "Test summary is saved to onnx/perf_results_GPU_B1_S128_20210714-002224.txt\n" + "Test summary is saved to onnx\\perf_results_GPU_B1_S128_20230912-130021.txt\n" ] } ], "source": 
[ - "GPU_OPTION = '--use_gpu' if use_gpu else ''\n", + "GPU_OPTION = '--use_gpu --use_io_binding' if use_gpu else ''\n", "!python -m onnxruntime.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 $GPU_OPTION" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20210714-002224.txt\n" + "Perf results from ./onnx\\perf_results_GPU_B1_S128_20230912-130021.txt\n" ] }, { @@ -1026,82 +1224,243 @@ " \n", " \n", " 0\n", - " 6.76\n", - " 6.79\n", - " 6.81\n", - " 6.90\n", - " 6.91\n", - " 7.00\n", - " 147.85\n", - " 3\n", + " 1.74\n", + " 1.72\n", + " 1.72\n", + " 1.75\n", + " 1.80\n", + " 2.17\n", + " 575.17\n", + " 14\n", " \n", " \n", " 1\n", - " 6.78\n", - " 6.70\n", - " 6.79\n", - " 6.87\n", - " 6.90\n", - " 7.63\n", - " 147.54\n", - " 4\n", + " 1.74\n", + " 1.73\n", + " 1.73\n", + " 1.75\n", + " 1.76\n", + " 2.14\n", + " 574.96\n", + " 24\n", " \n", " \n", " 2\n", - " 6.79\n", - " 6.79\n", - " 6.81\n", - " 6.89\n", - " 6.91\n", - " 7.19\n", - " 147.30\n", - " 2\n", + " 1.74\n", + " 1.72\n", + " 1.73\n", + " 1.76\n", + " 1.79\n", + " 2.16\n", + " 574.28\n", + " 15\n", " \n", " \n", " 3\n", - " 6.81\n", - " 6.80\n", - " 6.89\n", - " 6.91\n", - " 6.97\n", - " 7.20\n", - " 146.75\n", + " 1.75\n", + " 1.72\n", + " 1.72\n", + " 1.76\n", + " 2.02\n", + " 2.15\n", + " 572.89\n", + " 6\n", + " \n", + " \n", + " 4\n", + " 1.76\n", + " 1.74\n", + " 1.74\n", + " 1.76\n", + " 1.81\n", + " 2.14\n", + " 569.77\n", + " 13\n", + " \n", + " \n", + " 5\n", + " 1.76\n", + " 1.72\n", + " 1.73\n", + " 1.80\n", + " 2.08\n", + " 2.15\n", + " 568.67\n", + " 5\n", + " \n", + " \n", + " 6\n", + " 1.77\n", + " 1.73\n", + " 1.74\n", + " 1.81\n", + " 2.12\n", + " 2.19\n", + " 566.45\n", + " 32\n", + " \n", + " \n", + " 7\n", + " 1.77\n", + " 1.74\n", + " 1.74\n", + " 1.77\n", + " 2.06\n", + " 2.17\n", + " 566.38\n", + " 7\n", + " \n", + " \n", + " 8\n", + " 1.77\n", + " 1.73\n", + " 1.74\n", + " 1.81\n", + " 2.10\n", + " 2.18\n", + " 566.14\n", + " 3\n", + " \n", + " \n", + " 9\n", + " 1.77\n", + " 1.73\n", + " 1.74\n", + " 1.82\n", + " 2.07\n", + " 2.17\n", + " 566.09\n", + " 11\n", + " \n", + " \n", + " 10\n", + " 1.77\n", + " 1.74\n", + " 1.75\n", + " 1.78\n", + " 2.02\n", + " 2.13\n", + " 565.70\n", + " 9\n", + " \n", + " \n", + " 11\n", + " 1.77\n", + " 1.73\n", + " 1.74\n", + " 1.93\n", + " 2.06\n", + " 2.16\n", + " 565.50\n", + " 8\n", + " \n", + " \n", + " 12\n", + " 1.77\n", + " 1.73\n", + " 1.74\n", + " 1.81\n", + " 2.11\n", + " 2.20\n", + " 565.09\n", " 1\n", " \n", + " \n", + " 13\n", + " 1.77\n", + " 1.74\n", + " 1.75\n", + " 1.85\n", + " 2.06\n", + " 2.15\n", + " 563.97\n", + " 10\n", + " \n", + " \n", + " 14\n", + " 1.78\n", + " 1.73\n", + " 1.74\n", + " 1.93\n", + " 2.13\n", + " 2.19\n", + " 563.25\n", + " 2\n", + " \n", + " \n", + " 15\n", + " 1.78\n", + " 1.74\n", + " 1.75\n", + " 1.88\n", + " 2.10\n", + " 2.19\n", + " 561.98\n", + " 4\n", + " \n", + " \n", + " 16\n", + " 1.79\n", + " 1.75\n", + " 1.76\n", + " 1.99\n", + " 2.08\n", + " 2.16\n", + " 559.84\n", + " 12\n", + " \n", " \n", "\n", "" ], "text/plain": [ - " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", - "0 6.76 6.79 6.81 6.90 6.91 \n", - "1 6.78 6.70 6.79 6.87 6.90 \n", - "2 6.79 6.79 6.81 6.89 6.91 \n", - "3 6.81 6.80 6.89 6.91 6.97 \n", + " Latency(ms) Latency_P50 
Latency_P75 Latency_P90 Latency_P95 \\\n", + "0 1.74 1.72 1.72 1.75 1.80 \n", + "1 1.74 1.73 1.73 1.75 1.76 \n", + "2 1.74 1.72 1.73 1.76 1.79 \n", + "3 1.75 1.72 1.72 1.76 2.02 \n", + "4 1.76 1.74 1.74 1.76 1.81 \n", + "5 1.76 1.72 1.73 1.80 2.08 \n", + "6 1.77 1.73 1.74 1.81 2.12 \n", + "7 1.77 1.74 1.74 1.77 2.06 \n", + "8 1.77 1.73 1.74 1.81 2.10 \n", + "9 1.77 1.73 1.74 1.82 2.07 \n", + "10 1.77 1.74 1.75 1.78 2.02 \n", + "11 1.77 1.73 1.74 1.93 2.06 \n", + "12 1.77 1.73 1.74 1.81 2.11 \n", + "13 1.77 1.74 1.75 1.85 2.06 \n", + "14 1.78 1.73 1.74 1.93 2.13 \n", + "15 1.78 1.74 1.75 1.88 2.10 \n", + "16 1.79 1.75 1.76 1.99 2.08 \n", "\n", - " Latency_P99 Throughput(QPS) intra_op_num_threads \n", - "0 7.00 147.85 3 \n", - "1 7.63 147.54 4 \n", - "2 7.19 147.30 2 \n", - "3 7.20 146.75 1 " + " Latency_P99 Throughput(QPS) intra_op_num_threads \n", + "0 2.17 575.17 14 \n", + "1 2.14 574.96 24 \n", + "2 2.16 574.28 15 \n", + "3 2.15 572.89 6 \n", + "4 2.14 569.77 13 \n", + "5 2.15 568.67 5 \n", + "6 2.19 566.45 32 \n", + "7 2.17 566.38 7 \n", + "8 2.18 566.14 3 \n", + "9 2.17 566.09 11 \n", + "10 2.13 565.70 9 \n", + "11 2.16 565.50 8 \n", + "12 2.20 565.09 1 \n", + "13 2.15 563.97 10 \n", + "14 2.19 563.25 2 \n", + "15 2.19 561.98 4 \n", + "16 2.16 559.84 12 " ] }, - "execution_count": 22, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import os\n", - "import glob \n", - "import pandas\n", - "latest_result_file = max(glob.glob(\"./onnx/perf_results_GPU_B1_S128_*.txt\"), key=os.path.getmtime)\n", - "result_data = pandas.read_table(latest_result_file)\n", - "print(\"Float32 model perf results from\", latest_result_file)\n", - "# Remove some columns that have same values for all rows.\n", - "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n", - "result_data.drop(columns_to_remove, axis=1, inplace=True)\n", - "result_data" + "fp32_result = load_last_perf_test_result()\n", + "fp32_result" ] }, { @@ -1117,59 +1476,265 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "test setting TestSetting(batch_size=32, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=32,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 20.41 ms, Throughput = 1567.65 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.73 ms, Throughput = 576.74 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=2,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 2.18 ms, Throughput = 917.92 QPS\n", + "Running test: 
model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=4,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 3.25 ms, Throughput = 1229.91 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=8,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 5.38 ms, Throughput = 1486.89 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=16,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 9.90 ms, Throughput = 1616.79 QPS\n", + "test setting TestSetting(batch_size=32, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=128, random_sequence_length=False)\n", "Generating 1000 samples for batch_size=32 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=32,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 168.40 ms, Throughput = 190.02 QPS\n", - "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", + "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=128, random_sequence_length=False)\n", "Generating 1000 samples for batch_size=1 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 7.14 ms, Throughput = 140.00 QPS\n", - "test setting TestSetting(batch_size=2, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", + "test setting TestSetting(batch_size=2, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=128, random_sequence_length=False)\n", "Generating 1000 samples for batch_size=2 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=2,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 11.27 ms, Throughput = 177.41 QPS\n", - "test setting TestSetting(batch_size=4, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", + "test setting TestSetting(batch_size=4, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=128, random_sequence_length=False)\n", "Generating 1000 
samples for batch_size=4 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=4,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 21.15 ms, Throughput = 189.09 QPS\n", - "test setting TestSetting(batch_size=8, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", + "test setting TestSetting(batch_size=8, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=128, random_sequence_length=False)\n", "Generating 1000 samples for batch_size=8 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=8,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 42.27 ms, Throughput = 189.27 QPS\n", - "test setting TestSetting(batch_size=16, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", + "test setting TestSetting(batch_size=16, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=128, random_sequence_length=False)\n", "Generating 1000 samples for batch_size=16 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=16,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 83.77 ms, Throughput = 191.01 QPS\n", - "Test summary is saved to onnx/perf_results_GPU_B1-2-4-8-16-32_S128_20210714-002816.txt\n" + "Test summary is saved to onnx\\perf_results_GPU_B1-2-4-8-16-32_S128_20230912-130248.txt\n" ] } ], "source": [ - "GPU_OPTION = '--use_gpu' if use_gpu else ''\n", - "THREAD_SETTING = '--intra_op_num_threads 3'\n", + "THREAD_SETTING = '--intra_op_num_threads 8'\n", "!{sys.executable} -m onnxruntime.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 2 4 8 16 32 --sequence_length 128 --samples 1000 --test_times 1 $THREAD_SETTING $GPU_OPTION" ] }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Perf results from ./onnx\\perf_results_GPU_B1-2-4-8-16-32_S128_20230912-130248.txt\n" + ] + }, + { + "data": { + "text/html": [ + "
FP16 performance results (latency percentiles and throughput for each batch size, all runs with intra_op_num_threads=8) rendered as an HTML table; the same values appear in the text/plain output below.
" + ], + "text/plain": [ + " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", + "0 1.73 1.72 1.73 1.73 1.79 \n", + "1 2.18 2.16 2.16 2.18 2.29 \n", + "2 3.25 3.25 3.26 3.28 3.29 \n", + "3 5.38 5.38 5.39 5.42 5.44 \n", + "4 9.90 9.89 9.94 9.97 10.00 \n", + "5 20.41 20.41 20.47 20.52 20.55 \n", + "\n", + " Latency_P99 Throughput(QPS) intra_op_num_threads \n", + "0 2.04 576.74 8 \n", + "1 2.76 917.92 8 \n", + "2 3.43 1229.91 8 \n", + "3 5.60 1486.89 8 \n", + "4 10.06 1616.79 8 \n", + "5 20.68 1567.65 8 " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fp16_result = load_last_perf_test_result()\n", + "fp16_result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Packing Mode (Effective Transformer)\n", + "\n", + "When padding ratio is high, it is helpful to use packing mode, also known as [effective transformer](https://github.com/bytedance/effective_transformer).\n", + "This feature requires onnxruntime-gpu verison 1.16 or later. \n", + "\n", + "In below example, average sequence length after removing paddings is 32, the sequence length with paddings is 128. We can see 3x throughput with packing mode (QPS increased from 1617 to 5652)." + ] + }, { "cell_type": "code", "execution_count": 24, - "metadata": { - "scrolled": false - }, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "_replace_attention_with_packing_attention: Converted 24 Attention nodes to PackedAttention.\n", + " save_model_to_file: Sort graphs in topological order\n", + " save: Delete the existing onnx file: ./onnx/bert-base-cased-squad_opt_gpu_fp16_packed.onnx\n", + " save: Delete the existing external data file: ./onnx/bert-base-cased-squad_opt_gpu_fp16_packed.onnx.data\n", + " save_model_to_file: Model saved to ./onnx/bert-base-cased-squad_opt_gpu_fp16_packed.onnx\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running test: model=bert-base-cased-squad_opt_gpu_fp16_packed.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=32,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=32,random_sequence_length=False\n", + "Average latency = 5.66 ms, Throughput = 5652.40 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16_packed.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=32,random_sequence_length=False\n", + "Average latency = 1.70 ms, Throughput = 586.97 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16_packed.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=2,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=32,random_sequence_length=False\n", + "Average latency = 1.79 ms, Throughput = 1114.37 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16_packed.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=4,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=32,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 2262.31 QPS\n", + "Running test: 
model=bert-base-cased-squad_opt_gpu_fp16_packed.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=8,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=32,random_sequence_length=False\n", + "Average latency = 2.18 ms, Throughput = 3666.45 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16_packed.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=16,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=32,random_sequence_length=False\n", + "Average latency = 3.31 ms, Throughput = 4829.58 QPS\n", + "test setting TestSetting(batch_size=32, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=32, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=32 sequence_length=128\n", + "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=32, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=1 sequence_length=128\n", + "test setting TestSetting(batch_size=2, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=32, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=2 sequence_length=128\n", + "test setting TestSetting(batch_size=4, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=32, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=4 sequence_length=128\n", + "test setting TestSetting(batch_size=8, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=32, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=8 sequence_length=128\n", + "test setting TestSetting(batch_size=16, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=32, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=16 sequence_length=128\n", + "Test summary is saved to onnx\\perf_results_GPU_B1-2-4-8-16-32_S128_20230912-130354.txt\n" + ] + } + ], + "source": [ + "assert use_gpu, \"Require GPU for packing mode\"\n", + "packed_fp16_model_path = './onnx/bert-base-cased-squad_opt_gpu_fp16_packed.onnx'\n", + "!{sys.executable} -m onnxruntime.transformers.convert_to_packing_mode --input $optimized_fp16_model_path --output $packed_fp16_model_path --use_external_data_format\n", + "!{sys.executable} -m onnxruntime.transformers.bert_perf_test --model $packed_fp16_model_path --batch_size 1 2 4 8 16 32 --sequence_length 128 --average_sequence_length 32 --samples 1000 --test_times 1 $THREAD_SETTING $GPU_OPTION " + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Float16 model summary from 
./onnx/perf_results_GPU_B1-2-4-8-16-32_S128_20210714-002816.txt\n" + "Perf results from ./onnx\\perf_results_GPU_B1-2-4-8-16-32_S128_20230912-130354.txt\n" ] }, { @@ -1200,75 +1765,75 @@ " Latency_P95\n", " Latency_P99\n", " Throughput(QPS)\n", - " batch_size\n", + " intra_op_num_threads\n", " \n", " \n", " \n", " \n", " 0\n", - " 7.14\n", - " 7.10\n", - " 7.13\n", - " 7.25\n", - " 7.35\n", - " 10.99\n", - " 140.00\n", - " 1\n", + " 1.70\n", + " 1.63\n", + " 1.65\n", + " 2.13\n", + " 2.20\n", + " 2.32\n", + " 586.97\n", + " 8\n", " \n", " \n", " 1\n", - " 11.27\n", - " 11.23\n", - " 11.28\n", - " 11.53\n", - " 11.57\n", - " 12.05\n", - " 177.41\n", - " 2\n", + " 1.77\n", + " 1.74\n", + " 1.76\n", + " 1.82\n", + " 1.93\n", + " 2.17\n", + " 2262.31\n", + " 8\n", " \n", " \n", " 2\n", - " 21.15\n", - " 21.13\n", - " 21.25\n", - " 21.44\n", - " 21.59\n", - " 22.07\n", - " 189.09\n", - " 4\n", + " 1.79\n", + " 1.73\n", + " 1.74\n", + " 2.12\n", + " 2.18\n", + " 2.32\n", + " 1114.37\n", + " 8\n", " \n", " \n", " 3\n", - " 42.27\n", - " 42.26\n", - " 42.68\n", - " 42.95\n", - " 43.11\n", - " 45.11\n", - " 189.27\n", + " 2.18\n", + " 2.16\n", + " 2.17\n", + " 2.22\n", + " 2.30\n", + " 2.64\n", + " 3666.45\n", " 8\n", " \n", " \n", " 4\n", - " 83.77\n", - " 83.84\n", - " 84.29\n", - " 84.94\n", - " 85.35\n", - " 86.34\n", - " 191.01\n", - " 16\n", + " 3.31\n", + " 3.31\n", + " 3.32\n", + " 3.35\n", + " 3.39\n", + " 3.51\n", + " 4829.58\n", + " 8\n", " \n", " \n", " 5\n", - " 168.40\n", - " 169.62\n", - " 170.78\n", - " 171.94\n", - " 172.82\n", - " 174.28\n", - " 190.02\n", - " 32\n", + " 5.66\n", + " 5.66\n", + " 5.68\n", + " 5.71\n", + " 5.74\n", + " 5.91\n", + " 5652.40\n", + " 8\n", " \n", " \n", "\n", @@ -1276,38 +1841,30 @@ ], "text/plain": [ " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", - "0 7.14 7.10 7.13 7.25 7.35 \n", - "1 11.27 11.23 11.28 11.53 11.57 \n", - "2 21.15 21.13 21.25 21.44 21.59 \n", - "3 42.27 42.26 42.68 42.95 43.11 \n", - "4 83.77 83.84 84.29 84.94 85.35 \n", - "5 168.40 169.62 170.78 171.94 172.82 \n", + "0 1.70 1.63 1.65 2.13 2.20 \n", + "1 1.77 1.74 1.76 1.82 1.93 \n", + "2 1.79 1.73 1.74 2.12 2.18 \n", + "3 2.18 2.16 2.17 2.22 2.30 \n", + "4 3.31 3.31 3.32 3.35 3.39 \n", + "5 5.66 5.66 5.68 5.71 5.74 \n", "\n", - " Latency_P99 Throughput(QPS) batch_size \n", - "0 10.99 140.00 1 \n", - "1 12.05 177.41 2 \n", - "2 22.07 189.09 4 \n", - "3 45.11 189.27 8 \n", - "4 86.34 191.01 16 \n", - "5 174.28 190.02 32 " + " Latency_P99 Throughput(QPS) intra_op_num_threads \n", + "0 2.32 586.97 8 \n", + "1 2.17 2262.31 8 \n", + "2 2.32 1114.37 8 \n", + "3 2.64 3666.45 8 \n", + "4 3.51 4829.58 8 \n", + "5 5.91 5652.40 8 " ] }, - "execution_count": 24, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import os\n", - "import glob \n", - "import pandas\n", - "latest_result_file = max(glob.glob(\"./onnx/perf_results_*.txt\"), key=os.path.getmtime)\n", - "result_data = pandas.read_table(latest_result_file)\n", - "print(\"Float16 model summary from\", latest_result_file)\n", - "columns_to_remove = ['model', 'graph_optimization_level', 'test_cases', 'test_times', 'use_gpu', 'sequence_length']\n", - "columns_to_remove.extend(['intra_op_num_threads'])\n", - "result_data.drop(columns_to_remove, axis=1, inplace=True)\n", - "result_data" + "packing_result = load_last_perf_test_result()\n", + "packing_result" ] }, { @@ -1327,7 +1884,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": { 
"scrolled": true }, @@ -1336,131 +1893,53 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\r\n", - " \"gpu\": {\r\n", - " \"driver_version\": \"450.51.05\",\r\n", - " \"devices\": [\r\n", - " {\r\n", - " \"memory_total\": 15843721216,\r\n", - " \"memory_available\": 9313189888,\r\n", - " \"name\": \"Tesla T4\"\r\n", - " }\r\n", - " ]\r\n", - " },\r\n", - " \"cpu\": {\r\n", - " \"brand\": \"AMD EPYC 7V12 64-Core Processor\",\r\n", - " \"cores\": 4,\r\n", - " \"logical_cores\": 4,\r\n", - " \"hz\": [\r\n", - " 2445417000,\r\n", - " 0\r\n", - " ],\r\n", - " \"l2_cache\": 524288,\r\n", - " \"flags\": [\r\n", - " \"3dnowext\",\r\n", - " \"3dnowprefetch\",\r\n", - " \"abm\",\r\n", - " \"adx\",\r\n", - " \"aes\",\r\n", - " \"apic\",\r\n", - " \"arat\",\r\n", - " \"avx\",\r\n", - " \"avx2\",\r\n", - " \"bmi1\",\r\n", - " \"bmi2\",\r\n", - " \"clflush\",\r\n", - " \"clflushopt\",\r\n", - " \"clwb\",\r\n", - " \"cmov\",\r\n", - " \"cmp_legacy\",\r\n", - " \"cpuid\",\r\n", - " \"cr8_legacy\",\r\n", - " \"cx16\",\r\n", - " \"cx8\",\r\n", - " \"de\",\r\n", - " \"extd_apicid\",\r\n", - " \"f16c\",\r\n", - " \"fma\",\r\n", - " \"fpu\",\r\n", - " \"fsgsbase\",\r\n", - " \"fxsr\",\r\n", - " \"fxsr_opt\",\r\n", - " \"ht\",\r\n", - " \"hypervisor\",\r\n", - " \"lahf_lm\",\r\n", - " \"lm\",\r\n", - " \"mca\",\r\n", - " \"mce\",\r\n", - " \"misalignsse\",\r\n", - " \"mmx\",\r\n", - " \"mmxext\",\r\n", - " \"movbe\",\r\n", - " \"msr\",\r\n", - " \"mtrr\",\r\n", - " \"nopl\",\r\n", - " \"nx\",\r\n", - " \"osvw\",\r\n", - " \"osxsave\",\r\n", - " \"pae\",\r\n", - " \"pat\",\r\n", - " \"pclmulqdq\",\r\n", - " \"pdpe1gb\",\r\n", - " \"pge\",\r\n", - " \"pni\",\r\n", - " \"popcnt\",\r\n", - " \"pse\",\r\n", - " \"pse36\",\r\n", - " \"rdpid\",\r\n", - " \"rdrand\",\r\n", - " \"rdrnd\",\r\n", - " \"rdseed\",\r\n", - " \"rdtscp\",\r\n", - " \"rep_good\",\r\n", - " \"sep\",\r\n", - " \"sha\",\r\n", - " \"sha_ni\",\r\n", - " \"smap\",\r\n", - " \"smep\",\r\n", - " \"ssbd\",\r\n", - " \"sse\",\r\n", - " \"sse2\",\r\n", - " \"sse4_1\",\r\n", - " \"sse4_2\",\r\n", - " \"sse4a\",\r\n", - " \"ssse3\",\r\n", - " \"syscall\",\r\n", - " \"topoext\",\r\n", - " \"tsc\",\r\n", - " \"umip\",\r\n", - " \"vme\",\r\n", - " \"vmmcall\",\r\n", - " \"xgetbv1\",\r\n", - " \"xsave\",\r\n", - " \"xsavec\",\r\n", - " \"xsaveerptr\",\r\n", - " \"xsaveopt\",\r\n", - " \"xsaves\"\r\n", - " ],\r\n", - " \"processor\": \"x86_64\"\r\n", - " },\r\n", - " \"memory\": {\r\n", - " \"total\": 29450223616,\r\n", - " \"available\": 22402334720\r\n", - " },\r\n", - " \"python\": \"3.6.13.final.0 (64 bit)\",\r\n", - " \"os\": \"Linux-5.4.0-1046-azure-x86_64-with-debian-buster-sid\",\r\n", - " \"onnxruntime\": {\r\n", - " \"version\": \"1.8.1\",\r\n", - " \"support_gpu\": true\r\n", - " },\r\n", - " \"onnxruntime_tools\": null,\r\n", - " \"pytorch\": {\r\n", - " \"version\": \"1.9.0+cu111\",\r\n", - " \"support_gpu\": true,\r\n", - " \"cuda\": \"11.1\"\r\n", - " },\r\n", - " \"tensorflow\": null\r\n", - "}\r\n" + "{\n", + " \"gpu\": {\n", + " \"driver_version\": \"537.13\",\n", + " \"devices\": [\n", + " {\n", + " \"memory_total\": 25757220864,\n", + " \"memory_available\": 18009264128,\n", + " \"name\": \"NVIDIA GeForce RTX 4090\"\n", + " }\n", + " ]\n", + " },\n", + " \"cpu\": {\n", + " \"brand\": \"13th Gen Intel(R) Core(TM) i9-13900\",\n", + " \"cores\": 24,\n", + " \"logical_cores\": 32,\n", + " \"hz\": \"2000000000,0\",\n", + " \"l2_cache\": 33554432,\n", + " \"flags\": 
\"3dnow,3dnowprefetch,abm,acpi,adx,aes,apic,avx,avx2,bmi1,bmi2,clflush,clflushopt,clwb,cmov,cx16,cx8,de,dts,erms,est,f16c,fma,fpu,fxsr,gfni,ht,hypervisor,ia64,intel_pt,invpcid,lahf_lm,mca,mce,mmx,monitor,movbe,msr,mtrr,osxsave,pae,pat,pbe,pcid,pclmulqdq,pdcm,pge,pni,popcnt,pse,pse36,rdpid,rdrnd,rdseed,sep,serial,sha,smap,smep,ss,sse,sse2,sse4_1,sse4_2,ssse3,tm,tm2,tsc,tscdeadline,umip,vaes,vme,vpclmulqdq,x2apic,xsave,xtpr\",\n", + " \"processor\": \"Intel64 Family 6 Model 183 Stepping 1, GenuineIntel\"\n", + " },\n", + " \"memory\": {\n", + " \"total\": 33992912896,\n", + " \"available\": 17272422400\n", + " },\n", + " \"os\": \"Windows-10-10.0.22621-SP0\",\n", + " \"python\": \"3.10.13.final.0 (64 bit)\",\n", + " \"packages\": {\n", + " \"flatbuffers\": \"23.5.26\",\n", + " \"numpy\": \"1.25.2\",\n", + " \"onnx\": \"1.14.1\",\n", + " \"onnxruntime-gpu\": \"1.16.0\",\n", + " \"protobuf\": \"3.20.3\",\n", + " \"sympy\": \"1.12\",\n", + " \"torch\": \"2.0.1+cu118\",\n", + " \"transformers\": \"4.33.1\"\n", + " },\n", + " \"onnxruntime\": {\n", + " \"version\": \"1.16.0\",\n", + " \"support_gpu\": true\n", + " },\n", + " \"pytorch\": {\n", + " \"version\": \"2.0.1+cu118\",\n", + " \"support_gpu\": true,\n", + " \"cuda\": \"11.8\"\n", + " },\n", + " \"tensorflow\": null\n", + "}\n" ] } ], @@ -1485,9 +1964,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.13" + "version": "3.10.13" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } From a2fba28f6cdc6a4ea946a9d56b4b4d19241ebe27 Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Thu, 14 Sep 2023 20:43:24 -0700 Subject: [PATCH 29/34] Remove extraneous javascript includes (#17558) --- docs/c_cxx/doxygen-header.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/c_cxx/doxygen-header.html b/docs/c_cxx/doxygen-header.html index 364f76f7f0580..6d95bf57ff98f 100644 --- a/docs/c_cxx/doxygen-header.html +++ b/docs/c_cxx/doxygen-header.html @@ -16,7 +16,7 @@ - + $treeview $search $mathjax From 9aafbe3febd569da2613230ce45b1ce1054488d5 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 14 Sep 2023 21:14:44 -0700 Subject: [PATCH 30/34] [js/web] revise TensorView (#17473) ### Description This change: - removes the unused `Tensor` types declared in /js/web/lib/wasm/jsep/tensor.ts - removes duplicated util functions in /js/web/lib/wasm/jsep/tensor.ts - renames /js/web/lib/wasm/jsep/**tensor.ts** to /js/web/lib/wasm/jsep/**tensor-view.ts** and update corresponding references. It was kind of confusing that we have multiple `Tensor` types defined in different places also we have multiple `tensor.ts` source files. This is one of the prerequisites for supporting IO binding for WebGPU buffer in onnxruntime-web. 
list of prerequisites PRs: https://github.com/microsoft/onnxruntime/pull/17465 https://github.com/microsoft/onnxruntime/pull/17469 https://github.com/microsoft/onnxruntime/pull/17470 https://github.com/microsoft/onnxruntime/pull/17472 https://github.com/microsoft/onnxruntime/pull/17473 (this one) --- js/web/lib/wasm/jsep/backend-webgpu.ts | 2 +- js/web/lib/wasm/jsep/init.ts | 2 +- js/web/lib/wasm/jsep/tensor-view.ts | 39 ++++++ js/web/lib/wasm/jsep/tensor.ts | 115 ------------------ .../webgpu/ops/3rd-party/conv2d_mm_webgpu.ts | 2 +- .../ops/3rd-party/conv_backprop_webgpu.ts | 2 +- .../ops/3rd-party/matmul_packed_webgpu.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/argminmax.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/concat.ts | 2 +- .../lib/wasm/jsep/webgpu/ops/conv-grouped.ts | 2 +- .../wasm/jsep/webgpu/ops/conv-transpose.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/conv.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/conv2d-mm.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/einsum.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/expand.ts | 2 +- .../wasm/jsep/webgpu/ops/gather-elements.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/gather.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/gemm.ts | 2 +- .../lib/wasm/jsep/webgpu/ops/instance-norm.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/matmul.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/pad.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/pool.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/reduce.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/resize.ts | 2 +- .../wasm/jsep/webgpu/ops/skip-layer-norm.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/slice.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/softmax.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/split.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/tile.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/transpose.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts | 2 +- .../lib/wasm/jsep/webgpu/program-manager.ts | 2 +- js/web/lib/wasm/jsep/webgpu/types.ts | 3 +- 35 files changed, 72 insertions(+), 149 deletions(-) create mode 100644 js/web/lib/wasm/jsep/tensor-view.ts delete mode 100644 js/web/lib/wasm/jsep/tensor.ts diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index e6e78df2cfb23..5e77a0343b4ee 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -4,7 +4,7 @@ import {Env} from 'onnxruntime-common'; import {configureLogger, LOG_DEBUG} from './log'; -import {TensorView} from './tensor'; +import {TensorView} from './tensor-view'; import {createGpuDataManager, GpuDataManager} from './webgpu/gpu-data-manager'; import {RunFunction, WEBGPU_OP_RESOLVE_RULES} from './webgpu/op-resolve-rules'; import {ProgramManager} from './webgpu/program-manager'; diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index 24ff79cfad3ee..78316cbe1c825 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -8,7 +8,7 @@ import {DataType, getTensorElementSize} from '../wasm-common'; import {WebGpuBackend} from './backend-webgpu'; import {LOG_DEBUG} from './log'; -import {TensorView} from './tensor'; +import {TensorView} from './tensor-view'; import {ShapeUtil} from './util'; import {ComputeContext, ComputeContextInputsOutputsMapping, ProgramInfo, ProgramInfoLoader} from './webgpu/types'; diff --git a/js/web/lib/wasm/jsep/tensor-view.ts b/js/web/lib/wasm/jsep/tensor-view.ts new file mode 100644 index 0000000000000..69b9287f6de29 --- /dev/null +++ 
b/js/web/lib/wasm/jsep/tensor-view.ts @@ -0,0 +1,39 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import {Tensor} from 'onnxruntime-common'; + +import {tensorTypeToTypedArrayConstructor} from '../wasm-common'; + +export const createView = (dataBuffer: ArrayBuffer, type: Tensor.Type): Int32Array|Uint32Array|BigInt64Array| + BigUint64Array|Uint8Array|Float32Array|Float64Array|Int8Array|Int16Array|Uint16Array => + new (tensorTypeToTypedArrayConstructor(type))(dataBuffer); + +/** + * a TensorView does not own the data. + */ +export interface TensorView { + readonly data: number; + readonly dataType: number; + readonly dims: readonly number[]; + + /** + * get a Float32Array data view of the tensor data. tensor data must be on CPU. + */ + getFloat32Array(): Float32Array; + + /** + * get a BigInt64Array data view of the tensor data. tensor data must be on CPU. + */ + getBigInt64Array(): BigInt64Array; + + /** + * get a Int32Array data view of the tensor data. tensor data must be on CPU. + */ + getInt32Array(): Int32Array; + + /** + * create a new tensor view with the same data but different dimensions. + */ + reshape(newDims: readonly number[]): TensorView; +} diff --git a/js/web/lib/wasm/jsep/tensor.ts b/js/web/lib/wasm/jsep/tensor.ts deleted file mode 100644 index abe61e07fc0a8..0000000000000 --- a/js/web/lib/wasm/jsep/tensor.ts +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -export declare namespace Tensor { - export interface DataTypeMap { - bool: Uint8Array; - float32: Float32Array; - float64: Float64Array; - string: string[]; - int8: Int8Array; - uint8: Uint8Array; - int16: Int16Array; - uint16: Uint16Array; - int32: Int32Array; - uint32: Uint32Array; - int64: BigInt64Array; - uint64: BigUint64Array; - } - - export type DataType = keyof DataTypeMap; - - export type StringType = Tensor.DataTypeMap['string']; - export type BooleanType = Tensor.DataTypeMap['bool']; - export type IntegerType = Tensor.DataTypeMap['int8']|Tensor.DataTypeMap['uint8']|Tensor.DataTypeMap['int16']| - Tensor.DataTypeMap['uint16']|Tensor.DataTypeMap['int32']|Tensor.DataTypeMap['uint32']| - Tensor.DataTypeMap['int64']|Tensor.DataTypeMap['uint64']; - export type FloatType = Tensor.DataTypeMap['float32']|Tensor.DataTypeMap['float64']; - export type NumberType = BooleanType|IntegerType|FloatType; - - export type Id = number; -} - -export const sizeof = (type: Tensor.DataType): number => { - switch (type) { - case 'bool': - case 'int8': - case 'uint8': - return 1; - case 'int16': - case 'uint16': - return 2; - case 'int32': - case 'uint32': - case 'float32': - return 4; - case 'int64': - case 'uint64': - case 'float64': - return 8; - default: - throw new Error(`cannot calculate sizeof() on type ${type}`); - } -}; - -const dataviewConstructor = (type: Tensor.DataType) => { - switch (type) { - case 'bool': - case 'uint8': - return Uint8Array; - case 'int8': - return Int8Array; - case 'int16': - return Int16Array; - case 'uint16': - return Uint16Array; - case 'int32': - return Int32Array; - case 'uint32': - return Uint32Array; - case 'int64': - return BigInt64Array; - case 'uint64': - return BigUint64Array; - case 'float32': - return Float32Array; - case 'float64': - return Float64Array; - default: - // should never run to here - throw new Error('unspecified error'); - } -}; - -export const createView = (dataBuffer: ArrayBuffer, type: Tensor.DataType): 
Int32Array|Uint32Array|BigInt64Array| - BigUint64Array|Uint8Array|Float32Array|Float64Array|Int8Array|Int16Array|Uint16Array => - new (dataviewConstructor(type))(dataBuffer); - -/** - * a TensorView does not own the data. - */ -export interface TensorView { - readonly data: number; - readonly dataType: number; - readonly dims: readonly number[]; - - /** - * get a Float32Array data view of the tensor data. tensor data must be on CPU. - */ - getFloat32Array(): Float32Array; - - /** - * get a BigInt64Array data view of the tensor data. tensor data must be on CPU. - */ - getBigInt64Array(): BigInt64Array; - - /** - * get a Int32Array data view of the tensor data. tensor data must be on CPU. - */ - getInt32Array(): Int32Array; - - /** - * create a new tensor view with the same data but different dimensions. - */ - reshape(newDims: readonly number[]): TensorView; -} diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts index 02507ad802b36..08b1d1f30b233 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts @@ -20,7 +20,7 @@ // modified to fit the needs of the project import {LOG_DEBUG} from '../../../log'; -import {TensorView} from '../../../tensor'; +import {TensorView} from '../../../tensor-view'; import {ShapeUtil} from '../../../util'; import {GpuDataType, ProgramInfo, ProgramMetadata} from '../../types'; import {ConvAttributes} from '../conv'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts index 82fe3d5b6af43..ec6df438129fb 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts @@ -18,7 +18,7 @@ // sampled from [@tensorflow/tfjs] tfjs-backend-webgpu/src/conv_backprop_webgpu.ts import {LOG_DEBUG} from '../../../log'; -import {TensorView} from '../../../tensor'; +import {TensorView} from '../../../tensor-view'; import {ShapeUtil} from '../../../util'; import {GpuDataType, ProgramInfo, ProgramMetadata} from '../../types'; import {inputVariable, outputVariable, ShaderHelper} from '../common'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts index ab4f608451101..8d43dbb378a69 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts @@ -19,7 +19,7 @@ // // modified to fit the needs of the project -import {TensorView} from '../../../tensor'; +import {TensorView} from '../../../tensor-view'; import {ShapeUtil} from '../../../util'; import {GpuDataType, ProgramInfo, ProgramMetadata} from '../../types'; import {getBroadcastDims, IndicesHelper, inputVariable, outputVariable, ShaderHelper} from '../common'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/argminmax.ts b/js/web/lib/wasm/jsep/webgpu/ops/argminmax.ts index 12a13d9d8e0a0..412e61a3cc0f9 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/argminmax.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/argminmax.ts @@ -6,7 +6,7 @@ // a optimized codepath for this. 
import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts index b004ca37a2ea8..13d3a91bb339e 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {BroadcastUtil, ShapeUtil} from '../../util'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/concat.ts b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts index 9b294803d3787..279632c190ded 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/concat.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts index 8a794ce16a0b5..1b7b7e0b29a25 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts index acdfd7e40f258..e7d1ddf771650 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index 3a83b1c2de6c1..95a64e5787841 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. 
import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {PoolConvUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv2d-mm.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv2d-mm.ts index 0abece9559630..21c0b97042fbb 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv2d-mm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv2d-mm.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {GpuDataType, ProgramInfoLoader, ProgramMetadata} from '../types'; import {createConv2DMatMulProgramInfo} from './3rd-party/conv2d_mm_webgpu'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/einsum.ts b/js/web/lib/wasm/jsep/webgpu/ops/einsum.ts index f0196f37c3153..fc9ebf004ad25 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/einsum.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/einsum.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/expand.ts b/js/web/lib/wasm/jsep/webgpu/ops/expand.ts index 2d845775f1c62..824ce682c0c4b 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/expand.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/expand.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gather-elements.ts b/js/web/lib/wasm/jsep/webgpu/ops/gather-elements.ts index 57c5fccfd8c26..a7d355bc13704 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/gather-elements.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/gather-elements.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts index a915a4bbd969c..0db060dbec54a 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. 
import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts index 3ce963b54f3ee..1a36d4a7545d6 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {GemmUtil, ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts index 449073a133295..5a148bda0a9f7 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts index 8a9927b25a52e..d6a79e9460c3f 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts index e4dae00db6305..837ac8410f291 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {BroadcastUtil} from '../../util'; import {ComputeContext, GpuDataType, ProgramInfoLoader} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/pad.ts b/js/web/lib/wasm/jsep/webgpu/ops/pad.ts index d90296b5c5a46..c2f89fd2845df 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/pad.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/pad.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. 
import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts index 79071d32443d6..8c8c12fc54ddb 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {PoolConvUtil, ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/reduce.ts b/js/web/lib/wasm/jsep/webgpu/ops/reduce.ts index cb592c838dd97..0b8d03ea73b6b 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/reduce.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/reduce.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/resize.ts b/js/web/lib/wasm/jsep/webgpu/ops/resize.ts index 1d0b8229a76f7..8b9dbbf57ac75 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/resize.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/resize.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/skip-layer-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/skip-layer-norm.ts index 4b845bcf2121b..7bfdd73b8af18 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/skip-layer-norm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/skip-layer-norm.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/slice.ts b/js/web/lib/wasm/jsep/webgpu/ops/slice.ts index 4211e526898e6..257b9ebc1fdac 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/slice.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/slice.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. 
import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata, TensorInfo} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/softmax.ts b/js/web/lib/wasm/jsep/webgpu/ops/softmax.ts index e2443b24410a5..495a4bcea4f47 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/softmax.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/softmax.ts @@ -5,7 +5,7 @@ // performance limitations when the reduced axis is long. Need to add // a optimized codepath for this. -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/split.ts b/js/web/lib/wasm/jsep/webgpu/ops/split.ts index 9a150d21ea02e..3367091bbac23 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/split.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/split.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata, TensorInfo} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/tile.ts b/js/web/lib/wasm/jsep/webgpu/ops/tile.ts index 99d9668757caa..109c29bfc8a80 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/tile.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/tile.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts index 9243b0e4af6b6..38dcaeab54c54 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts index ef63d1177768c..7e52954734216 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. 
import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {MAX_CLIP, MIN_CLIP, ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/program-manager.ts b/js/web/lib/wasm/jsep/webgpu/program-manager.ts index cce61be3448cd..cf2687e4c7382 100644 --- a/js/web/lib/wasm/jsep/webgpu/program-manager.ts +++ b/js/web/lib/wasm/jsep/webgpu/program-manager.ts @@ -4,7 +4,7 @@ import {tensorDataTypeEnumToString} from '../../wasm-common'; import {WebGpuBackend} from '../backend-webgpu'; import {LOG_DEBUG} from '../log'; -import {TensorView} from '../tensor'; +import {TensorView} from '../tensor-view'; import {createShaderHelper} from './ops/common'; import {Artifact, GpuData, ProgramInfo} from './types'; diff --git a/js/web/lib/wasm/jsep/webgpu/types.ts b/js/web/lib/wasm/jsep/webgpu/types.ts index ddbb9afc275f2..78f80b89774e2 100644 --- a/js/web/lib/wasm/jsep/webgpu/types.ts +++ b/js/web/lib/wasm/jsep/webgpu/types.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {Tensor, TensorView} from '../tensor'; +import {TensorView} from '../tensor-view'; import {ShaderHelper} from './ops/common'; @@ -19,7 +19,6 @@ export interface GpuData { } export interface TensorInfo { - id?: Tensor.Id; dims: readonly number[]; dataType: number; gpuDataType: GpuDataType; From 94f2ed6bbd95d0561eaad7c7a42ee3f56eb9f29e Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 14 Sep 2023 21:15:10 -0700 Subject: [PATCH 31/34] run_CIs_for_external_pr.py: update required pipelines (#17557) ### Description Add required pipeline "Windows x64 QNN CI Pipeline" to script "run_CIs_for_external_pr.py" --- tools/python/run_CIs_for_external_pr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/python/run_CIs_for_external_pr.py b/tools/python/run_CIs_for_external_pr.py index beee4efc74c30..dcc6a92d84ef2 100644 --- a/tools/python/run_CIs_for_external_pr.py +++ b/tools/python/run_CIs_for_external_pr.py @@ -71,6 +71,7 @@ def main(): pipelines = [ # windows "Windows ARM64 QNN CI Pipeline", + "Windows x64 QNN CI Pipeline", "Windows CPU CI Pipeline", "Windows GPU CI Pipeline", "Windows GPU TensorRT CI Pipeline", From 4d931edd78370b18805bd908e92c28e50994f6af Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Fri, 15 Sep 2023 08:20:47 -0700 Subject: [PATCH 32/34] Update tensorrt_dependencies in setup.py (#17562) ### Description The files should not have the minor version number. The names were added in #17365 by mistake. ### Motivation and Context We did not successfully exclude them out. 
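A quick way to sanity-check the fix below (the wheel path is a placeholder and not part of this patch) is to list the built GPU wheel and confirm that the TensorRT runtime libraries are no longer bundled:

```bash
# Hypothetical verification step: after the fix, the GPU wheel should not bundle
# the TensorRT runtime libraries; they are provided by the local TensorRT install.
unzip -l dist/onnxruntime_gpu-*.whl | grep -E 'libnvinfer|libnvonnxparser' \
  || echo "No TensorRT libraries bundled in the wheel"
```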
--- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 13731eb4e76bb..7e6ab93194b0d 100644 --- a/setup.py +++ b/setup.py @@ -214,7 +214,7 @@ def run(self): "libhsa-runtime64.so.1", ] - tensorrt_dependencies = ["libnvinfer.so.8.6", "libnvinfer_plugin.so.8.6", "libnvonnxparser.so.8.6"] + tensorrt_dependencies = ["libnvinfer.so.8", "libnvinfer_plugin.so.8", "libnvonnxparser.so.8"] dest = "onnxruntime/capi/libonnxruntime_providers_openvino.so" if path.isfile(dest): From a5302fec93cd91d976e3798e38873927a2ddd8a9 Mon Sep 17 00:00:00 2001 From: zesongw Date: Fri, 15 Sep 2023 23:29:48 +0800 Subject: [PATCH 33/34] [WebNN EP] Fix bug for PRelu on CPU backend. (#17543) ### Description WebNN CPU backend expects slope of PRelu to be a static value. For now, we will not support it. ### Motivation and Context Fallback this case to pass the CI. --- .../webnn/builders/impl/binary_op_builder.cc | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc index 8e588aad15f4c..5adaf80543279 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc @@ -18,6 +18,10 @@ class BinaryOpBuilder : public BaseOpBuilder { private: Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const override ORT_MUST_USE_RESULT; + + // Operator support related. + bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, + const WebnnDeviceType device_type, const logging::Logger& logger) const override; }; // Add operator related. @@ -50,6 +54,24 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const return Status::OK(); } +bool BinaryOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, + const Node& node, + const WebnnDeviceType device_type, + const logging::Logger& logger) const { + const auto& input_defs = node.InputDefs(); + const auto& op_type = node.OpType(); + + // XNNPACK prelu operator expects slope to be a static value. + // https://github.com/google/XNNPACK/issues/4692 + // TODO: Remove this check after it is solved. + if (op_type == "PRelu" && !Contains(initializers, input_defs[1]->Name()) && device_type == WebnnDeviceType::CPU) { + LOGS(logger, VERBOSE) << "The second input (slope) for PRelu must be a constant initializer for WebNN CPU backend."; + return false; + } + + return true; +} + void CreateBinaryOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { if (op_registrations.op_builder_map.count(op_type) > 0) return; From 377f959c69e9f213cd4a8c71a5e80162a412989a Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Fri, 15 Sep 2023 23:35:55 +0800 Subject: [PATCH 34/34] Run Final_Jar_Testing_Linux_GPU in docker (#17533) ### Description 1. Create a package test image based on [RedHat UBI](https://www.redhat.com/en/blog/introducing-red-hat-universal-base-image) 2. Install TensorRT 8.6.1.6 in RedHat. (Ref. https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html#maclearn-net-repo-install-rpm) 3. 
Run Final_Jar_Testing_Linux_GPU in docker (base image: nvidia/cuda:11.8.0-cudnn8-devel-ubi8) ### Motivation and Context [AB#18470](https://aiinfra.visualstudio.com/6a833879-cd9b-44a4-a9de-adc2d818f13c/_workitems/edit/18470) ### Verification https://dev.azure.com/aiinfra/Lotus/_build/results?buildId=354004&view=logs&j=8939b564-1402-57b5-92dc-510eba75e069&t=8939b564-1402-57b5-92dc-510eba75e069 --- .../c-api-noopenmp-packaging-pipelines.yml | 35 ++++++++++----- .../flex-downloadPipelineArtifact.yml | 2 +- ...ckerfile.package_ubi8_cuda11_8_tensorrt8_6 | 45 +++++++++++++++++++ .../linux/docker/scripts/install_java.sh | 12 +++++ 4 files changed, 82 insertions(+), 12 deletions(-) create mode 100644 tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6 create mode 100755 tools/ci_build/github/linux/docker/scripts/install_java.sh diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 09b2a0697447e..fdd8c09333737 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -424,19 +424,32 @@ stages: - checkout: self submodules: false - template: templates/set-version-number-variables-step.yml - - task: DownloadPipelineArtifact@2 - displayName: 'Download Final Jar' - inputs: - buildType: 'current' - artifactName: 'onnxruntime-java-gpu' - targetPath: '$(Build.BinariesDirectory)/final-jar' - - task: Bash@3 + - template: templates/flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download Final Jar' + ArtifactName: onnxruntime-java-gpu + TargetPath: '$(Build.BinariesDirectory)/final-jar' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} + + - template: templates/get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6 + Context: tools/ci_build/github/linux/docker/ + DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" + Repository: onnxruntimeubi8packagestest + UpdateDepsTxt: false + + - bash: | + docker run --rm \ + --gpus all \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --volume /data/models:/build/models:ro \ + onnxruntimeubi8packagestest \ + /bin/bash /onnxruntime_src/tools/ci_build/github/linux/java_linux_final_test.sh -r /build -v $(OnnxRuntimeVersion) displayName: 'Test' - inputs: - targetType: filePath - filePath: 'tools/ci_build/github/linux/java_linux_final_test.sh' - arguments: '-r $(Build.BinariesDirectory) -v $(OnnxRuntimeVersion)' - template: templates/component-governance-component-detection-steps.yml parameters: diff --git a/tools/ci_build/github/azure-pipelines/templates/flex-downloadPipelineArtifact.yml b/tools/ci_build/github/azure-pipelines/templates/flex-downloadPipelineArtifact.yml index 0f4e0553d05bf..a83451a1b33d9 100644 --- a/tools/ci_build/github/azure-pipelines/templates/flex-downloadPipelineArtifact.yml +++ b/tools/ci_build/github/azure-pipelines/templates/flex-downloadPipelineArtifact.yml @@ -18,7 +18,7 @@ parameters: steps: - task: DownloadPipelineArtifact@2 - displayName: ${{ parameters.StepName }}} + displayName: ${{ parameters.StepName }} inputs: artifactName: ${{ parameters.ArtifactName}} targetPath: '${{ parameters.TargetPath }}' diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6 
b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6 new file mode 100644 index 0000000000000..cdf504c8e3b03 --- /dev/null +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6 @@ -0,0 +1,45 @@ +# -------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------- +# Dockerfile to Test ONNX Runtime on UBI8 with CUDA 11.8 and TensorRT 8.6 + +# Build base image with required system packages +FROM nvidia/cuda:11.8.0-cudnn8-devel-ubi8 AS base + +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} + +RUN dnf install -y bash wget &&\ + dnf clean dbcache + +# Install python3 +RUN dnf install -y \ + python3.8 \ + python38-pip \ + python38-wheel &&\ + cd /usr/local/bin &&\ + ln -s /usr/bin/python3 python3.8 &&\ + ln -s /usr/bin/pip3 pip3.8; + +RUN pip3 install --upgrade pip +RUN pip3 install setuptools>=41.0.0 + +# Install TensorRT +RUN dnf install -y libnvinfer8 libnvonnxparsers8 libnvparsers8 libnvinfer-plugin8 libnvinfer-lean8 libnvinfer-vc-plugin8 libnvinfer-dispatch8 +RUN v="8.6.1.6-1+cuda11.8" &&\ + dnf downgrade -y libnvinfer8-${v} libnvinfer8-${v} libnvonnxparsers8-${v} libnvparsers8-${v} libnvinfer-plugin8-${v} libnvinfer-lean8-${v} libnvinfer-vc-plugin8-${v} libnvinfer-dispatch8-${v} &&\ + dnf install -y dnf-plugin-versionlock &&\ + dnf versionlock libnvinfer8 libnvonnxparsers8 libnvparsers8 libnvinfer-plugin8 libnvinfer-lean8 libnvinfer-vc-plugin8 libnvinfer-dispatch8 +RUN dnf clean dbcache + + +ADD scripts /tmp/scripts +RUN cd /tmp/scripts && /tmp/scripts/install_dotnet.sh && /tmp/scripts/install_java.sh && rm -rf /tmp/scripts + +# Build final image from base. +FROM base as final +ARG BUILD_USER=onnxruntimedev +ARG BUILD_UID=1000 +RUN adduser --uid $BUILD_UID $BUILD_USER +WORKDIR /home/$BUILD_USER +USER $BUILD_USER diff --git a/tools/ci_build/github/linux/docker/scripts/install_java.sh b/tools/ci_build/github/linux/docker/scripts/install_java.sh new file mode 100755 index 0000000000000..d11e29f693b8b --- /dev/null +++ b/tools/ci_build/github/linux/docker/scripts/install_java.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -e -x + +if [ -f /etc/redhat-release ]; then + dnf install -y java-11-openjdk-devel \ + && dnf clean dbcache +elif [ -f /etc/os-release ]; then + apt-get update && apt-get install -y openjdk-11-jdk +else + echo "Unsupported OS" + exit 1 +fi
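For reference, a local equivalent of the new docker-based test stage can be sketched as follows. The image name, Dockerfile path, and test script come from the pipeline change above; BUILD_DIR and ORT_VERSION are placeholders that must point at the downloaded final-jar artifact and its matching ONNX Runtime version.

```bash
# Build the UBI8 CUDA 11.8 / TensorRT 8.6 test image added in this patch.
docker build -t onnxruntimeubi8packagestest \
  -f tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6 \
  tools/ci_build/github/linux/docker/

# Run the final-jar test inside the container, mirroring the pipeline step.
# BUILD_DIR is assumed to contain the final-jar artifact under final-jar/;
# ORT_VERSION must match the version of the jar under test.
docker run --rm --gpus all \
  --volume "$PWD":/onnxruntime_src \
  --volume "$BUILD_DIR":/build \
  onnxruntimeubi8packagestest \
  /bin/bash /onnxruntime_src/tools/ci_build/github/linux/java_linux_final_test.sh -r /build -v "$ORT_VERSION"
```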