diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000000000..896912f3d14f7
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,33 @@
+---
+nav_exclude: true
+---
+## Developing
+
+Once you've installed dependencies with `npm install` (or `yarn`), start a development server with hot-reload enabled:
+
+```bash
+npm run dev
+
+# or start the server and open the app in a new browser tab
+npm run dev -- --open
+```
+
+All working pages live in `src/routes/[page url]/+page.svelte`, which is where you should make your edits.
+
+### Technologies & relevant docs
+Please use the docs linked below to aid your development process. As a general target, we should write zero custom CSS, since daisyUI (a component framework) and tailwindcss (utility CSS classes) should cover all of our styling needs.
+- [Svelte](https://svelte.dev/)
+- daisyUI [docs](https://daisyui.com/)
+- tailwindcss [docs](https://tailwindcss.com/docs)
+
+
+## Building
+
+To create a production version of your app:
+
+```bash
+npm run build
+```
+
+You can preview the production build with `npm run preview`.
+
+> To deploy your app, you may need to install an [adapter](https://kit.svelte.dev/docs/adapters) for your target environment.
diff --git a/README.md b/README.md
deleted file mode 100644
index e8bfc25c4e72a..0000000000000
--- a/README.md
+++ /dev/null
@@ -1,34 +0,0 @@
-## Creating a project
-
-If you're seeing this, you've probably already done this step. Congrats!
-
-```bash
-# create a new project in the current directory
-npm create svelte@latest
-
-# create a new project in my-app
-npm create svelte@latest my-app
-```
-
-## Developing
-
-Once you've created a project and installed dependencies with `npm install` (or `pnpm install` or `yarn`), start a development server:
-
-```bash
-npm run dev
-
-# or start the server and open the app in a new browser tab
-npm run dev -- --open
-```
-
-## Building
-
-To create a production version of your app:
-
-```bash
-npm run build
-```
-
-You can preview the production build with `npm run preview`.
-
-> To deploy your app, you may need to install an [adapter](https://kit.svelte.dev/docs/adapters) for your target environment.
diff --git a/_config.yml b/_config.yml
index 0620f40ac8af4..3079a71fb07e6 100644
--- a/_config.yml
+++ b/_config.yml
@@ -35,3 +35,8 @@ gh_edit_repository: 'https://github.com/microsoft/onnxruntime' # the github URL
gh_edit_branch: 'gh-pages' # the branch that your docs is served from
# gh_edit_source: docs # the source that your files originate from
gh_edit_view_mode: 'tree' # "tree" or "edit" if you want the user to jump into the editor immediately
+nav_external_links:
+ - title: ONNX Runtime Docs on GitHub
+ url: https://github.com/microsoft/onnxruntime/tree/gh-pages
+ hide_icon: true # set to true to hide the external link icon - defaults to false
+ opens_in_new_tab: true # set to true to open this link in a new tab - defaults to false
\ No newline at end of file
diff --git a/_includes/head_custom.html b/_includes/head_custom.html
new file mode 100644
index 0000000000000..1e8933448db3e
--- /dev/null
+++ b/_includes/head_custom.html
@@ -0,0 +1,32 @@
+
+
+
+
+
diff --git a/_includes/header_custom.html b/_includes/header_custom.html
deleted file mode 100644
index 98c5cd25fd08a..0000000000000
--- a/_includes/header_custom.html
+++ /dev/null
@@ -1,32 +0,0 @@
-
-
-
-
-
diff --git a/docs/build/eps.md b/docs/build/eps.md
index b6138ecab1f2a..e3d43278b7c96 100644
--- a/docs/build/eps.md
+++ b/docs/build/eps.md
@@ -54,6 +54,11 @@ The onnxruntime code will look for the provider shared libraries in the same loc
### Build Instructions
{: .no_toc }
+The CUDA EP can be compiled with additional NHWC ops by passing an extra CMake argument.
+This option is not enabled by default due to the small number of supported NHWC operators.
+More operators will be added over time; for now, append `--cmake_extra_defines onnxruntime_USE_CUDA_NHWC_OPS=ON` to the build scripts below to compile with the NHWC operators.
+Another very helpful CMake build option is NVTX support (`onnxruntime_ENABLE_NVTX_PROFILE=ON`), which enables much easier profiling with [Nsight Systems](https://developer.nvidia.com/nsight-systems) and correlates CUDA kernels with their actual ONNX operators.
+
#### Windows
```
diff --git a/docs/build/inferencing.md b/docs/build/inferencing.md
index 340935eead419..9b24e7fb4b42f 100644
--- a/docs/build/inferencing.md
+++ b/docs/build/inferencing.md
@@ -84,7 +84,7 @@ When building on x86 Windows without "--arm" or "--arm64" or "--arm64ec" args,
By default, ONNX Runtime is configured to be built for a minimum target macOS version of 10.12.
The shared library in the release Nuget(s) and the Python wheel may be installed on macOS versions of 10.12+.
-If you would like to use [Xcode](https://developer.apple.com/xcode/) to build the onnxruntime for x86_64 macOS, please add the --user_xcode argument in the command line.
+If you would like to use [Xcode](https://developer.apple.com/xcode/) to build onnxruntime for x86_64 macOS, please add the `--use_xcode` argument to the command line.
Without this flag, the cmake build generator will be Unix makefile by default.
diff --git a/docs/execution-providers/CUDA-ExecutionProvider.md b/docs/execution-providers/CUDA-ExecutionProvider.md
index 4f88aceb19566..f73df1ebf0909 100644
--- a/docs/execution-providers/CUDA-ExecutionProvider.md
+++ b/docs/execution-providers/CUDA-ExecutionProvider.md
@@ -24,7 +24,7 @@ Pre-built binaries of ONNX Runtime with CUDA EP are published for most language
## Requirements
-Please reference table below for official GPU packages dependencies for the ONNX Runtime inferencing package. Note that ONNX Runtime Training is aligned with PyTorch CUDA versions; refer to the Training tab on [onnxruntime.ai](https://onnxruntime.ai/) for supported versions.
+Please refer to the table below for the official GPU package dependencies of the ONNX Runtime inferencing package. Note that ONNX Runtime Training is aligned with PyTorch CUDA versions; refer to the Training tab on [onnxruntime.ai](https://onnxruntime.ai/) for supported versions.
Note: Because of CUDA Minor Version Compatibility, Onnx Runtime built with CUDA 11.4 should be compatible with any CUDA 11.x version.
Please reference [Nvidia CUDA Minor Version Compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/#minor-version-compatibility).
@@ -57,10 +57,28 @@ The device ID.
Default value: 0
+### user_compute_stream
+Defines the compute stream for the inference to run on.
+It implicitly sets the `has_user_compute_stream` option. It cannot be set through `UpdateCUDAProviderOptions`; use `UpdateCUDAProviderOptionsWithValue` instead.
+This cannot be used in combination with an external allocator.
+It cannot be set using the Python API.
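+
+A minimal C++ sketch of how this option can be passed (the same pattern appears in the full C/C++ sample further below; the stream name is illustrative):
+```c++
+// Sketch: hand an existing CUDA stream to the CUDA EP via the V2 provider options.
+const auto& api = Ort::GetApi();
+OrtCUDAProviderOptionsV2* cuda_options = nullptr;
+Ort::ThrowOnError(api.CreateCUDAProviderOptions(&cuda_options));
+
+cudaStream_t compute_stream = nullptr;
+cudaStreamCreateWithFlags(&compute_stream, cudaStreamNonBlocking);
+// This implicitly sets has_user_compute_stream.
+Ort::ThrowOnError(api.UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", compute_stream));
+```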
+
+### do_copy_in_default_stream
+Whether to do copies in the default stream or use separate streams. The recommended setting is true. Setting it to false may improve performance but can introduce race conditions.
+
+Default value: true
+
+### use_ep_level_unified_stream
+Uses the same CUDA stream for all threads of the CUDA EP. This is implicitly enabled by `has_user_compute_stream`, `enable_cuda_graph` or when using an external allocator.
+
+Default value: false
+
### gpu_mem_limit
The size limit of the device memory arena in bytes. This size limit is only for the execution provider's arena. The total device memory usage may be higher.
Default value: max value of C++ size_t type (effectively unlimited)
+_Note:_ This will be overridden by the contents of `default_memory_arena_cfg` (if specified).
+
### arena_extend_strategy
The strategy for extending the device memory arena.
@@ -71,6 +89,8 @@ kSameAsRequested (1) | extend by the requested amount
Default value: kNextPowerOfTwo
+_Note:_ This will be overridden by the contents of `default_memory_arena_cfg` (if specified).
+
### cudnn_conv_algo_search
The type of search done for cuDNN convolution algorithms.
@@ -82,55 +102,78 @@ DEFAULT (2) | default algorithm using CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PR
Default value: EXHAUSTIVE
-### do_copy_in_default_stream
-Whether to do copies in the default stream or use separate streams. The recommended setting is true. If false, there are race conditions and possibly better performance.
-
-Default value: true
### cudnn_conv_use_max_workspace
Check [tuning performance for convolution heavy models](#convolution-heavy-models) for details on what this flag does.
-This flag is only supported from the V2 version of the provider options struct when used using the C API. The V2 provider options struct can be created using [this](https://onnxruntime.ai/docs/api/c/struct_ort_api.html#a0d29cbf555aa806c050748cf8d2dc172) and updated using [this](https://onnxruntime.ai/docs/api/c/struct_ort_api.html#a4710fc51f75a4b9a75bde20acbfa0783). Please take a look at the sample below for an example.
+This flag is only supported from the V2 version of the provider options struct when used via the C API (see the sample below).
Default value: 1, for versions 1.14 and later
0, for previous versions
### cudnn_conv1d_pad_to_nc1d
Check [convolution input padding in the CUDA EP](#convolution-input-padding) for details on what this flag does.
-This flag is only supported from the V2 version of the provider options struct when used using the C API. The V2 provider options struct can be created using [this](https://onnxruntime.ai/docs/api/c/struct_ort_api.html#a0d29cbf555aa806c050748cf8d2dc172) and updated using [this](https://onnxruntime.ai/docs/api/c/struct_ort_api.html#a4710fc51f75a4b9a75bde20acbfa0783). Please take a look at the sample below for an example.
+This flag is only supported from the V2 version of the provider options struct when used via the C API (see the sample below).
Default value: 0
### enable_cuda_graph
Check [using CUDA Graphs in the CUDA EP](#using-cuda-graphs-preview) for details on what this flag does.
-This flag is only supported from the V2 version of the provider options struct when used using the C API. The V2 provider options struct can be created using [this](https://onnxruntime.ai/docs/api/c/struct_ort_api.html#a0d29cbf555aa806c050748cf8d2dc172) and updated using [this](https://onnxruntime.ai/docs/api/c/struct_ort_api.html#a4710fc51f75a4b9a75bde20acbfa0783).
+This flag is only supported from the V2 version of the provider options struct when used via the C API (see the sample below).
Default value: 0
### enable_skip_layer_norm_strict_mode
-Whether to use strict mode in SkipLayerNormalization cuda implementation. The default and recommanded setting is false. If enabled, accuracy improvement and performance drop can be expected.
-This flag is only supported from the V2 version of the provider options struct when used using the C API. The V2 provider options struct can be created using [this](https://onnxruntime.ai/docs/api/c/struct_ort_api.html#a0d29cbf555aa806c050748cf8d2dc172) and updated using [this](https://onnxruntime.ai/docs/api/c/struct_ort_api.html#a4710fc51f75a4b9a75bde20acbfa0783).
+Whether to use strict mode in the SkipLayerNormalization CUDA implementation. The default and recommended setting is false. If enabled, expect improved accuracy but degraded performance.
+This flag is only supported from the V2 version of the provider options struct when used via the C API (see the sample below).
+
+Default value: 0
+
+### gpu_external_[alloc|free|empty_cache]
+
+The `gpu_external_*` options are used to pass external allocators.
+Example Python usage:
+```python
+from onnxruntime.training.ortmodule.torch_cpp_extensions import torch_gpu_allocator
+provider_option_map["gpu_external_alloc"] = str(torch_gpu_allocator.gpu_caching_allocator_raw_alloc_address())
+provider_option_map["gpu_external_free"] = str(torch_gpu_allocator.gpu_caching_allocator_raw_delete_address())
+provider_option_map["gpu_external_empty_cache"] = str(torch_gpu_allocator.gpu_caching_allocator_empty_cache_address())
+```
+
+Default value: 0
+
+### prefer_nhwc
+This option is not available in default builds! ONNX Runtime must be compiled with `onnxruntime_USE_CUDA_NHWC_OPS=ON`.
+If enabled, the EP prefers NHWC operators over NCHW and adds the needed layout transforms to the model. Since NVIDIA Tensor Cores can only operate on the NHWC layout, this can increase performance if the model consists of many supported operators and does not need too many new transpose nodes. Wider operator support is planned for the future.
+This flag is only supported from the V2 version of the provider options struct when used via the C API. The V2 provider options struct can be created using [CreateCUDAProviderOptions](https://onnxruntime.ai/docs/api/c/struct_ort_api.html#a0d29cbf555aa806c050748cf8d2dc172) and updated using [UpdateCUDAProviderOptions](https://onnxruntime.ai/docs/api/c/struct_ort_api.html#a4710fc51f75a4b9a75bde20acbfa0783).
Default value: 0
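+
+A minimal C++ sketch (assuming ONNX Runtime was built with `onnxruntime_USE_CUDA_NHWC_OPS=ON`):
+```c++
+// Sketch: request NHWC-preferred execution through the V2 CUDA provider options (C API).
+const auto& api = Ort::GetApi();
+OrtCUDAProviderOptionsV2* cuda_options = nullptr;
+Ort::ThrowOnError(api.CreateCUDAProviderOptions(&cuda_options));
+
+std::vector<const char*> keys{"prefer_nhwc"};
+std::vector<const char*> values{"1"};
+Ort::ThrowOnError(api.UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size()));
+// Append the options to the session options afterwards, as shown in the C/C++ sample below.
+```
+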
## Performance Tuning
-The [I/O Binding feature](../performance/tune-performance/iobinding.md) should be utilized to avoid overhead resulting from copies on inputs and outputs.
+The [I/O Binding feature](../performance/tune-performance/iobinding.md) should be utilized to avoid overhead resulting from copies on inputs and outputs. Ideally, uploads and downloads can be hidden behind the inference. This can be achieved by issuing asynchronous copies while inference is running, as demonstrated in this [PR](https://github.com/microsoft/onnxruntime/pull/14088).
+```c++
+Ort::RunOptions run_options;
+run_options.AddConfigEntry("disable_synchronize_execution_providers", "1");
+session->Run(run_options, io_binding);
+```
+When synchronization on the inference is disabled, the user has to take care of synchronizing the compute stream after execution.
+This feature should only be used with device-local memory or with an ORT value allocated in [pinned memory](https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-cc/); otherwise the issued download will be blocking and will not behave as desired.
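+
+A short sketch of the required manual synchronization (assuming the session was configured with a `user_compute_stream` named `compute_stream`; the name is illustrative):
+```c++
+// With "disable_synchronize_execution_providers" set, Run() returns without synchronizing the stream.
+Ort::RunOptions run_options;
+run_options.AddConfigEntry("disable_synchronize_execution_providers", "1");
+session->Run(run_options, io_binding);
+// Outputs (device-local or pinned host memory) are only valid after the stream is synchronized.
+cudaStreamSynchronize(compute_stream);
+```
+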
### Convolution-heavy models
-ORT leverages CuDNN for convolution operations and the first step in this process is to determine which "optimal" convolution algorithm to use while performing the convolution operation for the given input configuration (input shape, filter shape, etc.) in each `Conv` node. This sub-step involves querying CuDNN for a "workspace" memory size and have this allocated so that CuDNN can use this auxiliary memory while determining the "optimal" convolution algorithm to use.
+ORT leverages CuDNN for convolution operations and the first step in this process is to determine which "optimal" convolution algorithm to use while performing the convolution operation for the given input configuration (input shape, filter shape, etc.) in each `Conv` node. This sub-step involves querying CuDNN for a "workspace" memory size and have this allocated so that CuDNN can use this auxiliary memory while determining the "optimal" convolution algorithm to use.
The default value of `cudnn_conv_use_max_workspace` is 1 for versions 1.14 or later, and 0 for previous versions. When its value is 0, ORT clamps the workspace size to 32 MB which may lead to a sub-optimal convolution algorithm getting picked by CuDNN. To allow ORT to allocate the maximum possible workspace as determined by CuDNN, a provider option named `cudnn_conv_use_max_workspace` needs to get set (as shown below).
Keep in mind that using this flag may increase the peak memory usage by a factor (sometimes a few GBs) but this does help CuDNN pick the best convolution algorithm for the given input. We have found that this is an important flag to use while using an fp16 model as this allows CuDNN to pick tensor core algorithms for the convolution operations (if the hardware supports tensor core operations). This flag may or may not result in performance gains for other data types (`float` and `double`).
-* Python
+* Python
```python
providers = [("CUDAExecutionProvider", {"cudnn_conv_use_max_workspace": '1'})]
sess_options = ort.SessionOptions()
sess = ort.InferenceSession("my_conv_heavy_fp16_model.onnx", sess_options=sess_options, providers=providers)
```
-
+
* C/C++
```c++
@@ -219,7 +262,7 @@ Currently, there are some constraints with regards to using the CUDA Graphs feat
* Multi-threaded usage is currently not supported, i.e. `Run()` MAY NOT be invoked on the same `InferenceSession` object from multiple threads while using CUDA Graphs.
-NOTE: The very first `Run()` performs a variety of tasks under the hood like making CUDA memory allocations, capturing the CUDA graph for the model, and then performing a graph replay to ensure that the graph runs. Due to this, the latency associated with the first `Run()` is bound to be high. Subsequent `Run()`s only perform graph replays of the graph captured and cached in the first `Run()`.
+NOTE: The very first `Run()` performs a variety of tasks under the hood like making CUDA memory allocations, capturing the CUDA graph for the model, and then performing a graph replay to ensure that the graph runs. Due to this, the latency associated with the first `Run()` is bound to be high. Subsequent `Run()`s only perform graph replays of the graph captured and cached in the first `Run()`.
* Python
@@ -267,10 +310,10 @@ NOTE: The very first `Run()` performs a variety of tasks under the hood like mak
void operator()(void* ptr) const {
alloc_->Free(ptr);
}
-
+
const Ort::Allocator* alloc_;
};
-
+
// Enable cuda graph in cuda provider option.
OrtCUDAProviderOptionsV2* cuda_options = nullptr;
api.CreateCUDAProviderOptions(&cuda_options);
@@ -377,6 +420,10 @@ std::vector values{"0", "2147483648", "kSameAsRequested", "DEFAULT"
UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size());
+cudaStream_t cuda_stream;
+cudaStreamCreate(&cuda_stream);
+// this implicitly sets "has_user_compute_stream"
+UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", cuda_stream);
OrtSessionOptions* session_options = /* ... */;
SessionOptionsAppendExecutionProvider_CUDA_V2(session_options, cuda_options);
@@ -419,4 +466,3 @@ cudaProviderOptions.add("cudnn_conv1d_pad_to_nc1d","1");
OrtSession.SessionOptions options = new OrtSession.SessionOptions(); // Must be closed after the session closes
options.addCUDA(cudaProviderOptions);
```
-
diff --git a/docs/execution-providers/TensorRT-ExecutionProvider.md b/docs/execution-providers/TensorRT-ExecutionProvider.md
index 176cfc093117c..af6b673c78898 100644
--- a/docs/execution-providers/TensorRT-ExecutionProvider.md
+++ b/docs/execution-providers/TensorRT-ExecutionProvider.md
@@ -9,7 +9,7 @@ redirect_from: /docs/reference/execution-providers/TensorRT-ExecutionProvider
# TensorRT Execution Provider
{: .no_toc }
-With the TensorRT execution provider, the ONNX Runtime delivers better inferencing performance on the same hardware compared to generic GPU acceleration.
+With the TensorRT execution provider, the ONNX Runtime delivers better inferencing performance on the same hardware compared to generic GPU acceleration.
The TensorRT execution provider in the ONNX Runtime makes use of NVIDIA's [TensorRT](https://developer.nvidia.com/tensorrt) Deep Learning inferencing engine to accelerate ONNX model in their family of GPUs. Microsoft and NVIDIA worked closely to integrate the TensorRT execution provider with ONNX Runtime.
@@ -100,47 +100,55 @@ There are two ways to configure TensorRT settings, either by **TensorRT Executio
| trt_profile_max_shapes | ORT_TENSORRT_PROFILE_MAX_SHAPES | string |
| trt_profile_opt_shapes | ORT_TENSORRT_PROFILE_OPT_SHAPES | string |
-> Note: for bool type options, assign them with **True**/**False** in python, or **1**/**0** in C++.
+> Note: for bool type options, assign them with **True**/**False** in python, or **1**/**0** in C++.
### Execution Provider Options
-TensorRT configurations can be set by execution provider options. It's useful when each model and inference session have their own configurations. In this case, execution provider option settings will override any environment variable settings. All configurations should be set explicitly, otherwise default value will be taken.
+TensorRT configurations can be set by execution provider options. It's useful when each model and inference session have their own configurations. In this case, execution provider option settings will override any environment variable settings. All configurations should be set explicitly, otherwise default value will be taken.
-* `trt_max_workspace_size`: maximum workspace size for TensorRT engine.
+* `device_id`: The device ID.
+ * Default value: 0
+
+* `user_compute_stream`: Defines the compute stream for the inference to run on. It implicitly sets the `has_user_compute_stream` option. It cannot be set through `UpdateTensorRTProviderOptions`; use `UpdateTensorRTProviderOptionsWithValue` instead (see the C++ sample below).
+ * This cannot be used in combination with an external allocator.
+ * This cannot be set using the Python API.
+
+* `trt_max_workspace_size`: maximum workspace size for TensorRT engine.
* Default value: 1073741824 (1GB).
-* `trt_max_partition_iterations`: maximum number of iterations allowed in model partitioning for TensorRT.
- * If target model can't be successfully partitioned when the maximum number of iterations is reached, the whole model will fall back to other execution providers such as CUDA or CPU.
+* `trt_max_partition_iterations`: maximum number of iterations allowed in model partitioning for TensorRT.
+ * If target model can't be successfully partitioned when the maximum number of iterations is reached, the whole model will fall back to other execution providers such as CUDA or CPU.
* Default value: 1000.
-* `trt_min_subgraph_size`: minimum node size in a subgraph after partitioning.
- * Subgraphs with smaller size will fall back to other execution providers.
+* `trt_min_subgraph_size`: minimum node size in a subgraph after partitioning.
+ * Subgraphs with smaller size will fall back to other execution providers.
* Default value: 1.
-* `trt_fp16_enable`: Enable FP16 mode in TensorRT.
+* `trt_fp16_enable`: Enable FP16 mode in TensorRT.
> Note: not all Nvidia GPUs support FP16 precision.
-* `trt_int8_enable`: Enable INT8 mode in TensorRT.
+* `trt_int8_enable`: Enable INT8 mode in TensorRT.
> Note: not all Nvidia GPUs support INT8 precision.
* `trt_int8_calibration_table_name`: Specify INT8 calibration table file for non-QDQ models in INT8 mode.
> Note: calibration table should not be provided for QDQ model because TensorRT doesn't allow calibration table to be loded if there is any Q/DQ node in the model. By default the name is empty.
-* `trt_int8_use_native_calibration_table`: Select what calibration table is used for non-QDQ models in INT8 mode.
- * If `True`, native TensorRT generated calibration table is used;
+* `trt_int8_use_native_calibration_table`: Select what calibration table is used for non-QDQ models in INT8 mode.
+ * If `True`, native TensorRT generated calibration table is used;
* If `False`, ONNXRUNTIME tool generated calibration table is used.
> Note: Please copy up-to-date calibration table file to `trt_engine_cache_path` before inference. Calibration table is specific to models and calibration data sets. Whenever new calibration table is generated, old file in the path should be cleaned up or be replaced.
-* `trt_dla_enable`: Enable DLA (Deep Learning Accelerator).
+* `trt_dla_enable`: Enable DLA (Deep Learning Accelerator).
> Note: Not all Nvidia GPUs support DLA.
* `trt_dla_core`: Specify DLA core to execute on. Default value: 0.
-* `trt_engine_cache_enable`: Enable TensorRT engine caching.
- * The purpose of using engine caching is to save engine build time in the case that TensorRT may take long time to optimize and build engine.
+* `trt_engine_cache_enable`: Enable TensorRT engine caching.
+
+ * The purpose of engine caching is to save engine build time, since TensorRT can take a long time to optimize and build an engine.
- * Engine will be cached when it's built for the first time so next time when new inference session is created the engine can be loaded directly from cache. In order to validate that the loaded engine is usable for current inference, engine profile is also cached and loaded along with engine. If current input shapes are in the range of the engine profile, the loaded engine can be safely used. Otherwise if input shapes are out of range, profile cache will be updated to cover the new shape and engine will be recreated based on the new profile (and also refreshed in the engine cache).
+ * Engine will be cached when it's built for the first time so next time when new inference session is created the engine can be loaded directly from cache. In order to validate that the loaded engine is usable for current inference, engine profile is also cached and loaded along with engine. If current input shapes are in the range of the engine profile, the loaded engine can be safely used. Otherwise if input shapes are out of range, profile cache will be updated to cover the new shape and engine will be recreated based on the new profile (and also refreshed in the engine cache).
* Note each engine is created for specific settings such as model path/name, precision (FP32/FP16/INT8 etc), workspace, profiles etc, and specific GPUs and it's not portable, so it's essential to make sure those settings are not changing, otherwise the engine needs to be rebuilt and cached again.
@@ -152,7 +160,7 @@ TensorRT configurations can be set by execution provider options. It's useful wh
* `trt_engine_cache_path`: Specify path for TensorRT engine and profile files if `trt_engine_cache_enable` is `True`, or path for INT8 calibration table file if `trt_int8_enable` is `True`.
-* `trt_dump_subgraphs`: Dumps the subgraphs that are transformed into TRT engines in onnx format to the filesystem.
+* `trt_dump_subgraphs`: Dumps the subgraphs that are transformed into TRT engines in onnx format to the filesystem.
* This can help debugging subgraphs, e.g. by using `trtexec --onnx my_model.onnx` and check the outputs of the parser.
* `trt_force_sequential_engine_build`: Sequentially build TensorRT engines across provider instances in multi-GPU environment.
@@ -161,37 +169,38 @@ TensorRT configurations can be set by execution provider options. It's useful wh
* `trt_layer_norm_fp32_fallback`: Force Pow + Reduce ops in layer norm to FP32.
-* `trt_timing_cache_enable`: Enable TensorRT timing cache.
+* `trt_timing_cache_enable`: Enable TensorRT timing cache.
* Check [Timing cache](#timing-cache) for details.
* `trt_force_timing_cache`: Force the TensorRT timing cache to be used even if device profile does not match.
+ * A perfect match means the exact same GPU model as the one that produced the timing cache.
* `trt_detailed_build_log`: Enable detailed build step logging on TensorRT EP with timing for each engine build.
* `trt_build_heuristics_enable`: Build engine using heuristics to reduce build time.
-* `trt_sparsity_enable`: Control if sparsity can be used by TRT.
+* `trt_sparsity_enable`: Control if sparsity can be used by TRT.
* Check `--sparsity` in `trtexec` command-line flags for [details](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#trtexec-flags).
-* `trt_builder_optimization_level`: Set the builder optimization level.
+* `trt_builder_optimization_level`: Set the builder optimization level.
> WARNING: levels below 3 do not guarantee good engine performance, but greatly improve build time. Default 3, valid range [0-5]. Check `--builderOptimizationLevel` in `trtexec` command-line flags for [details](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#trtexec-flags).
-* `trt_auxiliary_streams`: Set maximum number of auxiliary streams per inference stream.
- * Setting this value to 0 will lead to optimal memory usage.
- * Default -1 = heuristics.
+* `trt_auxiliary_streams`: Set maximum number of auxiliary streams per inference stream.
+ * Setting this value to 0 will lead to optimal memory usage.
+ * Default -1 = heuristics.
* Check `--maxAuxStreams` in `trtexec` command-line flags for [details](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#trtexec-flags).
-* `trt_tactic_sources`: Specify the tactics to be used by adding (+) or removing (-) tactics from the default tactic sources (default = all available tactics)
+* `trt_tactic_sources`: Specify the tactics to be used by adding (+) or removing (-) tactics from the default tactic sources (default = all available tactics)
* e.g. "-CUDNN,+CUBLAS" available keys: "CUBLAS", "CUBLAS_LT", "CUDNN" or "EDGE_MASK_CONVOLUTIONS".
-* `trt_extra_plugin_lib_paths`: Specify extra TensorRT plugin library paths.
- * ORT TRT by default supports any TRT plugins registered in TRT registry in TRT plugin library (i.e., `libnvinfer_plugin.so`).
- * Moreover, if users want to use other TRT plugins that are not in TRT plugin library,
- * for example, FasterTransformer has many TRT plugin implementations for different models, user can specify like this `ORT_TENSORRT_EXTRA_PLUGIN_LIB_PATHS=libvit_plugin.so;libvit_int8_plugin.so`.
+* `trt_extra_plugin_lib_paths`: Specify extra TensorRT plugin library paths.
+ * ORT TRT by default supports any TRT plugins registered in TRT registry in TRT plugin library (i.e., `libnvinfer_plugin.so`).
+ * Moreover, if users want to use other TRT plugins that are not in TRT plugin library,
+ * for example, FasterTransformer has many TRT plugin implementations for different models, user can specify like this `ORT_TENSORRT_EXTRA_PLUGIN_LIB_PATHS=libvit_plugin.so;libvit_int8_plugin.so`.
-* `trt_profile_min_shapes`, `trt_profile_max_shapes` and `trt_profile_opt_shapes` : Build with dynamic shapes using a profile with the min/max/opt shapes provided.
- * The format of the profile shapes is `input_tensor_1:dim_1xdim_2x...,input_tensor_2:dim_3xdim_4x...,...`
- * These three flags should all be provided in order to enable explicit profile shapes feature.
+* `trt_profile_min_shapes`, `trt_profile_max_shapes` and `trt_profile_opt_shapes` : Build with dynamic shapes using a profile with the min/max/opt shapes provided.
+ * The format of the profile shapes is `input_tensor_1:dim_1xdim_2x...,input_tensor_2:dim_3xdim_4x...,...`
+ * These three flags should all be provided in order to enable explicit profile shapes feature.
* Check [Explicit shape range for dynamic shape input](#explicit-shape-range-for-dynamic-shape-input) and TRT doc [optimization profiles](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#opt_profiles) for more details.
@@ -203,20 +212,58 @@ Besides, `device_id` can also be set by execution provider option.
```c++
Ort::SessionOptions session_options;
-OrtTensorRTProviderOptions trt_options{};
-
-// note: for bool type options in c++ API, set them as 0/1
-trt_options.device_id = 1;
-trt_options.trt_max_workspace_size = 2147483648;
-trt_options.trt_max_partition_iterations = 10;
-trt_options.trt_min_subgraph_size = 5;
-trt_options.trt_fp16_enable = 1;
-trt_options.trt_int8_enable = 1;
-trt_options.trt_int8_use_native_calibration_table = 1;
-trt_options.trt_engine_cache_enable = 1;
-trt_options.trt_engine_cache_path = "/path/to/cache"
-trt_options.trt_dump_subgraphs = 1;
-session_options.AppendExecutionProvider_TensorRT(trt_options);
+
+const auto& api = Ort::GetApi();
+OrtTensorRTProviderOptionsV2* tensorrt_options;
+Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&tensorrt_options));
+
+std::vector<const char*> option_keys = {
+ "device_id",
+ "trt_max_workspace_size",
+ "trt_max_partition_iterations",
+ "trt_min_subgraph_size",
+ "trt_fp16_enable",
+ "trt_int8_enable",
+ "trt_int8_use_native_calibration_table",
+ "trt_dump_subgraphs",
+ // the options below are strongly recommended!
+ "trt_engine_cache_enable",
+ "trt_engine_cache_path",
+ "trt_timing_cache_enable",
+ "trt_timing_cache_path",
+};
+std::vector<const char*> option_values = {
+ "1",
+ "2147483648",
+ "10",
+ "5",
+ "1",
+ "1",
+ "1",
+ "1",
+ "1",
+ "1",
+ "/path/to/cache",
+ "1",
+ "/path/to/cache", // can be same as the engine cache folder
+};
+
+Ort::ThrowOnError(api.UpdateTensorRTProviderOptions(tensorrt_options,
+ option_keys.data(), option_values.data(), option_keys.size()));
+
+
+cudaStream_t cuda_stream;
+cudaStreamCreate(&cuda_stream);
+// this implicitly sets "has_user_compute_stream"
+Ort::ThrowOnError(api.UpdateTensorRTProviderOptionsWithValue(tensorrt_options, "user_compute_stream", cuda_stream));
+
+session_options.AppendExecutionProvider_TensorRT_V2(tensorrt_options);
+// The code below can be used to print all options.
+OrtAllocator* allocator;
+char* options;
+Ort::ThrowOnError(api.GetAllocatorWithDefaultOptions(&allocator));
+Ort::ThrowOnError(api.GetTensorRTProviderOptionsAsString(tensorrt_options, allocator, &options));
+
```
@@ -257,7 +304,7 @@ sess = ort.InferenceSession(model_path, sess_options=sess_opt, providers=provide
Following environment variables can be set for TensorRT execution provider. Click below for more details.
-
+
* `ORT_TENSORRT_MAX_WORKSPACE_SIZE`: maximum workspace size for TensorRT engine. Default value: 1073741824 (1GB).
* `ORT_TENSORRT_MAX_PARTITION_ITERATIONS`: maximum number of iterations allowed in model partitioning for TensorRT. If target model can't be successfully partitioned when the maximum number of iterations is reached, the whole model will fall back to other execution providers such as CUDA or CPU. Default value: 1000.
@@ -273,7 +320,7 @@ Following environment variables can be set for TensorRT execution provider. Clic
* `ORT_TENSORRT_INT8_USE_NATIVE_CALIBRATION_TABLE`: Select what calibration table is used for non-QDQ models in INT8 mode. If 1, native TensorRT generated calibration table is used; if 0, ONNXRUNTIME tool generated calibration table is used. Default value: 0.
* **Note: Please copy up-to-date calibration table file to `ORT_TENSORRT_CACHE_PATH` before inference. Calibration table is specific to models and calibration data sets. Whenever new calibration table is generated, old file in the path should be cleaned up or be replaced.**
-* `ORT_TENSORRT_DLA_ENABLE`: Enable DLA (Deep Learning Accelerator). 1: enabled, 0: disabled. Default value: 0. Note not all Nvidia GPUs support DLA.
+* `ORT_TENSORRT_DLA_ENABLE`: Enable DLA (Deep Learning Accelerator). 1: enabled, 0: disabled. Default value: 0. Note not all Nvidia GPUs support DLA.
* `ORT_TENSORRT_DLA_CORE`: Specify DLA core to execute on. Default value: 0.
@@ -310,7 +357,7 @@ Following environment variables can be set for TensorRT execution provider. Clic
* `ORT_TENSORRT_TACTIC_SOURCES`: Specify the tactics to be used by adding (+) or removing (-) tactics from the default tactic sources (default = all available tactics) e.g. "-CUDNN,+CUBLAS" available keys: "CUBLAS", "CUBLAS_LT", "CUDNN" or "EDGE_MASK_CONVOLUTIONS".
-* `ORT_TENSORRT_EXTRA_PLUGIN_LIB_PATHS`: Specify extra TensorRT plugin library paths. ORT TRT by default supports any TRT plugins registered in TRT registry in TRT plugin library (i.e., `libnvinfer_plugin.so`). Moreover, if users want to use other TRT plugins that are not in TRT plugin library, for example, FasterTransformer has many TRT plugin implementations for different models, user can specify like this `ORT_TENSORRT_EXTRA_PLUGIN_LIB_PATHS=libvit_plugin.so;libvit_int8_plugin.so`.
+* `ORT_TENSORRT_EXTRA_PLUGIN_LIB_PATHS`: Specify extra TensorRT plugin library paths. ORT TRT by default supports any TRT plugins registered in TRT registry in TRT plugin library (i.e., `libnvinfer_plugin.so`). Moreover, if users want to use other TRT plugins that are not in TRT plugin library, for example, FasterTransformer has many TRT plugin implementations for different models, user can specify like this `ORT_TENSORRT_EXTRA_PLUGIN_LIB_PATHS=libvit_plugin.so;libvit_int8_plugin.so`.
* `ORT_TENSORRT_PROFILE_MIN_SHAPES`, `ORT_TENSORRT_PROFILE_MAX_SHAPES` and `ORT_TENSORRT_PROFILE_OPT_SHAPES` : Build with dynamic shapes using a profile with the min/max/opt shapes provided. The format of the profile shapes is "input_tensor_1:dim_1xdim_2x...,input_tensor_2:dim_3xdim_4x...,..." and these three flags should all be provided in order to enable explicit profile shapes feature. Check [Explicit shape range for dynamic shape input](#explicit-shape-range-for-dynamic-shape-input) and TRT doc [optimization profiles](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#opt_profiles) for more details.
@@ -320,7 +367,7 @@ One can override default values by setting environment variables. e.g. on Linux:
# Override default max workspace size to 2GB
export ORT_TENSORRT_MAX_WORKSPACE_SIZE=2147483648
-# Override default maximum number of iterations to 10
+# Override default maximum number of iterations to 10
export ORT_TENSORRT_MAX_PARTITION_ITERATIONS=10
# Override default minimum subgraph node size to 5
@@ -337,7 +384,7 @@ export ORT_TENSORRT_INT8_USE_NATIVE_CALIBRATION_TABLE=1
# Enable TensorRT engine caching
export ORT_TENSORRT_ENGINE_CACHE_ENABLE=1
-# Please Note warning above. This feature is experimental.
+# Please Note warning above. This feature is experimental.
# Engine cache files must be invalidated if there are any changes to the model, ORT version, TensorRT version or if the underlying hardware changes. Engine files are not portable across devices.
# Specify TensorRT cache path
@@ -362,7 +409,7 @@ If some operators in the model are not supported by TensorRT, ONNX Runtime will
### TensorRT Plugins Support
ORT TRT can leverage the TRT plugins which come with TRT plugin library in official release. To use TRT plugins, firstly users need to create the custom node (a one-to-one mapping to TRT plugin) with a registered plugin name and `trt.plugins` domain in the ONNX model. So, ORT TRT can recognize this custom node and pass the node together with the subgraph to TRT. Please see following python example to create a new custom node in the ONNX model:
-
+
Click below for Python API example:
@@ -405,7 +452,9 @@ def generate_model(model_name):
Note: If users want to use TRT plugins that are not in the TRT plugin library in official release, please see the ORT TRT provider option `trt_extra_plugin_lib_paths` for more details.
### Timing cache
-Enabling `trt_timing_cache_enable` will enable ORT TRT to use TensorRT timing cache to accelerate engine build time on a device with the same compute capability. This will work across models as it simply stores kernel latencies for specific configurations. Those files are usually very small (only a few KB or MB) which makes them very easy to ship with an application to accelerate the build time on the user end.
+Enabling `trt_timing_cache_enable` will enable ORT TRT to use TensorRT timing cache to accelerate engine build time on a device with the same compute capability. This will work across models as it simply stores kernel latencies for specific configurations and cubins (TRT 9.0+). Those files are usually very small (only a few KB or MB) which makes them very easy to ship with an application to accelerate the build time on the user end.
+
+_Note:_ A timing cache can be used across devices with the same [GPU compute capability](https://developer.nvidia.com/cuda-gpus), similar to an engine. Nonetheless, the preferred way is to use one cache per GPU model; in practice, sharing across one compute capability works well in most cases.
The following examples shows build time reduction with timing cache:
@@ -448,9 +497,9 @@ sess.run(
-### Explicit shape range for dynamic shape input
+### Explicit shape range for dynamic shape input
-ORT TRT lets you explicitly specify min/max/opt shapes for each dynamic shape input through three provider options, `trt_profile_min_shapes`, `trt_profile_max_shapes` and `trt_profile_opt_shapes`. If these three provider options are not specified
+ORT TRT lets you explicitly specify min/max/opt shapes for each dynamic shape input through three provider options, `trt_profile_min_shapes`, `trt_profile_max_shapes` and `trt_profile_opt_shapes`. If these three provider options are not specified
and model has dynamic shape input, ORT TRT will determine the min/max/opt shapes for the dynamic shape input based on incoming input tensor. The min/max/opt shapes are required for TRT optimization profile (An optimization profile describes a range of dimensions for each TRT network input and the dimensions that the auto-tuner will use for optimization. When using runtime dimensions, you must create at least one optimization profile at build time.)
To use the engine cache built with optimization profiles specified by explicit shape ranges, user still needs to provide those three provider options as well as engine cache enable flag.
@@ -522,16 +571,16 @@ This example shows how to run the Faster R-CNN model on TensorRT execution provi
```
3. To test model with sample input and verify the output, run `onnx_test_runner` under ONNX Runtime build directory.
-
+
> Models and test_data_set_ folder need to be stored under the same path. `onnx_test_runner` will test all models under this path.
```bash
./onnx_test_runner -e tensorrt /path/to/onnx/model/
```
-4. To test on model performance, run `onnxruntime_perf_test` on your shape-inferred Faster-RCNN model
+4. To test on model performance, run `onnxruntime_perf_test` on your shape-inferred Faster-RCNN model
- > Download sample test data with model from [model zoo](https://github.com/onnx/models/tree/main/vision/object_detection_segmentation/faster-rcnn), and put test_data_set folder next to your inferred model
+ > Download sample test data with model from [model zoo](https://github.com/onnx/models/tree/main/vision/object_detection_segmentation/faster-rcnn), and put test_data_set folder next to your inferred model
```bash
# e.g.
@@ -542,4 +591,3 @@ This example shows how to run the Faster R-CNN model on TensorRT execution provi
```
Please see [this Notebook](https://github.com/microsoft/onnxruntime/blob/main/docs/python/notebooks/onnx-inference-byoc-gpu-cpu-aks.ipynb) for an example of running a model on GPU using ONNX Runtime through Azure Machine Learning Services.
-
diff --git a/docs/execution-providers/Vitis-AI-ExecutionProvider.md b/docs/execution-providers/Vitis-AI-ExecutionProvider.md
index a0e408696381a..655b563bcaff4 100644
--- a/docs/execution-providers/Vitis-AI-ExecutionProvider.md
+++ b/docs/execution-providers/Vitis-AI-ExecutionProvider.md
@@ -35,12 +35,12 @@ The following table lists AMD targets that are supported by the Vitis AI ONNX Ru
AMD Adaptable SoC developers can also leverage the Vitis AI ONNX Runtime Execution Provider to support custom (chip-down) designs.
## Ryzen AI Demo
-A [demonstration](https://github.com/amd/RyzenAI-cloud-to-client-demo) is available that showcases one potential application of AMD's XDNA technology. New users are encouraged to test it out.
+A [demonstration](https://github.com/amd/RyzenAI-SW/tree/main/demo/cloud-to-client) is available that showcases one potential application of AMD's XDNA technology. New users are encouraged to test it out.
## Install
### AMD Adaptable SoC Installation
-For AMD Adaptable SoC targets, a pre-built package is provided to deploy ONNX models on embedded Linux. Users should refer to the standard Vitis AI [Target Setup Instructions](https://xilinx.github.io/Vitis-AI/3.0/html/docs/workflow.html) to enable Vitis AI on the target. Once Vitis AI has been enabled on the target, the developer can refer to [this section](https://docs.xilinx.com/r/en-US/ug1414-vitis-ai/Programming-with-VOE) of the Vitis AI documentation for installation and API details.
+For AMD Adaptable SoC targets, a pre-built package is provided to deploy ONNX models on embedded Linux. Users should refer to the standard Vitis AI [Target Setup Instructions](https://xilinx.github.io/Vitis-AI/3.5/html/docs/workflow.html) to enable Vitis AI on the target. Once Vitis AI has been enabled on the target, the developer can refer to [this section](https://docs.xilinx.com/r/en-US/ug1414-vitis-ai/Programming-with-VOE) of the Vitis AI documentation for installation and API details.
For more complete examples, developers should refer to [ONNX Runtime Vitis AI Execution Provider examples](https://github.com/Xilinx/Vitis-AI/tree/master/examples/vai_library/samples_onnx).
@@ -48,25 +48,69 @@ For more complete examples, developers should refer to [ONNX Runtime Vitis AI Ex
To enable the Vitis AI ONNX Runtime Execution Provider in Microsoft Windows, a .zip archive is provided.
+Developers can refer to the installation section of the [Ryzen AI Documentation](https://ryzenai.docs.amd.com/en/latest/inst.html) for more detailed instructions.
+
The contents of this archive are as follows:
```
-voe-3.5-win_amd64.zip
- │
- ├── Examples
- │ │
- │ └── resnet50_python # ONNX ResNet50 ILSVRC2012
- ├── voe-0.1.0-cp39-cp39-win_amd64 # Folder containing DLLs for C++ API
- │ │
- │ ├── onnxruntime.dll
- │ ├── onnxruntime_vitisai_ep.dll
- ├── vaip_config.json # Runtime configuration file
- ├── installer.py # Python script to copy dlls
- ├── voe-0.1.0-cp39-cp39-win_amd64.whl # Python installation package
- ├── onnxruntime_vitisai-1.15.1-cp39-cp39-win_amd64.whl # Python installation package
- ├── 1x4.xclbin # IPU Execution Provider executable file
- ├── 5x4.xclbin # IPU Execution Provider executable file
- │
+ryzen-ai-sw-1.0.zip
+├── env.yaml
+├── install.bat
+├── quicktest
+│ ├── image_0.png
+│ ├── quicktest.py
+│ ├── quicktest_random.py
+│ ├── README.md
+│ ├── requirements.txt
+│ └── test_model.onnx
+├── requirements_ryzenai_voe.txt
+├── vai_q_onnx-1.16.0+be3c70b-py2.py3-none-any.whl
+└── voe-4.0-win_amd64
+ ├── 1x4.xclbin
+ ├── 4x4.xclbin
+ ├── aieml_gemm_asr.json
+ ├── aieml_gemm_asr_qdq.json
+ ├── aieml_gemm_asr_qdq.xclbin
+ ├── aieml_gemm_asr.xclbin
+ ├── aieml_gemm_vm_phx_4x4_bf16.json
+ ├── aieml_gemm_vm_phx_4x4_bf16.xclbin
+ ├── aieml_gemm_vm_phx_4x4.json
+ ├── aieml_gemm_vm_phx_4x4.xclbin
+ ├── flexml_vaiep
+ │ ├── binary-modules
+ │ │ └── ResNet.flexml
+ │ │ ├── 0
+ │ │ │ ├── ctrl_pkt0.bin
+ │ │ │ ├── ctrlPktPatch.json
+ │ │ │ ├── ctrl_pkts.xclbin
+ │ │ │ ├── flexmlrt-hsi.json
+ │ │ │ ├── ml_txn.bin
+ │ │ │ └── wts32.txt
+ │ │ ├── flexml_bm.signature
+ │ │ ├── libflexml_usermodel.dll
+ │ │ └── partition-info.json
+ │ ├── demo
+ │ │ ├── imagenet-classes.txt
+ │ │ ├── requirements.txt
+ │ │ ├── setup.bat
+ │ │ ├── test-single-image.py
+ │ │ ├── utils
+ │ │ │ ├── image_utils.py
+ │ │ │ ├── __init__.py
+ │ │ │ └── onnx.py
+ │ │ └── YellowLabradorLooking_new.jpg
+ │ ├── lib
+ │ │ └── flexmlrt.dll
+ │ ├── onnx-models
+ │ │ └── resnet50_v1_3_vaiq.onnx
+ │ └── README.md
+ ├── installer.py
+ ├── onnxruntime_vitisai-1.15.1-cp39-cp39-win_amd64.whl
+ ├── vaip_config.json
+ ├── voe-0.1.0-cp39-cp39-win_amd64
+ │ ├── onnxruntime.dll
+ │ └── onnxruntime_vitisai_ep.dll
+ └── voe-0.1.0-cp39-cp39-win_amd64.whl
```
**_NOTE:_** Ryzen AI Linux support is not enabled in this release.
@@ -75,21 +119,22 @@ Both C++ and Python APIs are supported. The following instructions assume that
**1. Verify Pre-requisites:**
-- Visual Studio = 2022
+- Visual Studio = 2019
- cmake (version >= 3.26)
- python (version >= 3.9) (Python 3.9.13 64bit recommended)
-- AMD IPU driver >= 10.105.5.38 installed
+- AMD IPU driver = 10.1109.8.100 installed
**2. Prepare the files for installation:**
-- Download the [Ryzen AI ONNX Runtime Package](https://account.amd.com/en/forms/downloads/ryzen-ai-software-platform-xef.html?filename=voe-3.5-win_amd64.zip).
-- Unzip `voe-[version]-win_amd64.zip`.
+- Download the [Ryzen AI Software Package](https://account.amd.com/en/forms/downloads/ryzen-ai-software-platform-xef.html?filename=ryzen-ai-sw-1.0.zip).
+- Unzip `ryzen-ai-sw-1.0.zip`.
+- Enter the `voe-4.0-win_amd64` ONNX runtime folder.
**3. Install the C++ Libraries:**
**_NOTE:_** This is an optional step which is only required if you intend to use the C++ APIs.
-- Copy the DLL files from the bin subdirectory of the extracted archive to `C:\Program Files\onnxruntime\bin`, (ie `copy bin\*.dll C:\Program Files\onnxruntime\bin`). This will install the Vitis AI ONNX Runtime Engine libraries.
+- Copy the DLL files from the `voe-0.1.0-cp39-cp39-win_amd64` subdirectory of the extracted archive to `C:\Program Files\onnxruntime\bin` (i.e. `copy voe-0.1.0-cp39-cp39-win_amd64\*.dll C:\Program Files\onnxruntime\bin`). This will install the Vitis AI ONNX Runtime Engine libraries.
**4. Set the XLNX_VART_FIRMWARE environmental variable:**
@@ -109,7 +154,7 @@ pip install voe-[version]-cp39-cp39-win_amd64.whl
**4. Run the ResNet50 example:**
-- Leverage the scripts in the `Examples\resnet50_python` folder to test your installation.
+- Leverage the scripts in the `quicktest` folder to test your installation.
## Build
@@ -126,7 +171,7 @@ The current release of the Vitis AI Execution Provider ingests quantized ONNX mo
[Pytorch](https://hub.docker.com/r/amdih/ryzen-ai-pytorch), [Tensorflow 2.x](https://hub.docker.com/r/amdih/ryzen-ai-tensorflow2) and [Tensorflow 1.x](https://hub.docker.com/r/amdih/ryzen-ai-tensorflow) dockers are avaialable to support quantization of PyTorch and TensorFlow models. To support the Vitis AI ONNX Runtime Execution Provider, an option is provided in the Vitis AI Quantizer to export a quantized model in ONNX format, post quantization.
-[ONNX Quantizer python wheel](https://www.xilinx.com/bin/public/openDownload?filename=vai_q_onnx-1.14.0-py2.py3-none-any.whl) is available to parse and quantize ONNX models, enabling an end-to-end ONNX model -> ONNX Runtime workflow. Also, in a future release, the Vitis AI ONNX Runtime Execution Provider will support on-the-fly quantization, enabling direct deployment of FP32 ONNX models.
+The **ONNX Quantizer Python wheel**, also provided in the [Ryzen AI Software Package](https://account.amd.com/en/forms/downloads/ryzen-ai-software-platform-xef.html?filename=ryzen-ai-sw-1.0.zip), is available to parse and quantize ONNX models, enabling an end-to-end ONNX model -> ONNX Runtime workflow. Also, in a future release, the Vitis AI ONNX Runtime Execution Provider will support on-the-fly quantization, enabling direct deployment of FP32 ONNX models.
See [Model Quantization](https://xilinx.github.io/Vitis-AI/3.5/html/docs/workflow-model-development.html#model-quantization) for details.
diff --git a/docs/tutorials/csharp/bert-nlp-csharp-console-app.md b/docs/tutorials/csharp/bert-nlp-csharp-console-app.md
index ac42131dace19..488bf8b39c431 100644
--- a/docs/tutorials/csharp/bert-nlp-csharp-console-app.md
+++ b/docs/tutorials/csharp/bert-nlp-csharp-console-app.md
@@ -138,8 +138,8 @@ Now that we have tested the model in Python its time to build it out in C#. The
```PowerShell
dotnet add package Microsoft.ML.OnnxRuntime --version 1.16.0
dotnet add package Microsoft.ML.OnnxRuntime.Managed --version 1.16.0
-dotnet add package dotnet add package Microsoft.ML
-dotnet add package dotnet add package BERTTokenizers --version 1.1.0
+dotnet add package Microsoft.ML
+dotnet add package BERTTokenizers --version 1.1.0
```
### Create the App
diff --git a/src/images/blogs/accelerating-llama-2/Figure1-LLaMA-2-13B-E2E-Throughput.png b/src/images/blogs/accelerating-llama-2/Figure1-LLaMA-2-13B-E2E-Throughput.png
index 4646c06045023..6588146b16ad3 100644
Binary files a/src/images/blogs/accelerating-llama-2/Figure1-LLaMA-2-13B-E2E-Throughput.png and b/src/images/blogs/accelerating-llama-2/Figure1-LLaMA-2-13B-E2E-Throughput.png differ
diff --git a/src/images/blogs/accelerating-llama-2/Figure1-LLaMA-2-7B-E2E-Throughput.png b/src/images/blogs/accelerating-llama-2/Figure1-LLaMA-2-7B-E2E-Throughput.png
index eecce4751cbd0..79c2efbe634db 100644
Binary files a/src/images/blogs/accelerating-llama-2/Figure1-LLaMA-2-7B-E2E-Throughput.png and b/src/images/blogs/accelerating-llama-2/Figure1-LLaMA-2-7B-E2E-Throughput.png differ
diff --git a/src/images/blogs/accelerating-llama-2/Figure2-LLaMA-2-13B-Prompt-Latency.png b/src/images/blogs/accelerating-llama-2/Figure2-LLaMA-2-13B-Prompt-Latency.png
index b8cdf011dfec3..3bbc358d1486f 100644
Binary files a/src/images/blogs/accelerating-llama-2/Figure2-LLaMA-2-13B-Prompt-Latency.png and b/src/images/blogs/accelerating-llama-2/Figure2-LLaMA-2-13B-Prompt-Latency.png differ
diff --git a/src/images/blogs/accelerating-llama-2/Figure2-LLaMA-2-7B-Prompt-Latency 1.png b/src/images/blogs/accelerating-llama-2/Figure2-LLaMA-2-7B-Prompt-Latency 1.png
deleted file mode 100644
index 814a8e6d81a02..0000000000000
Binary files a/src/images/blogs/accelerating-llama-2/Figure2-LLaMA-2-7B-Prompt-Latency 1.png and /dev/null differ
diff --git a/src/images/blogs/accelerating-llama-2/Figure2-LLaMA-2-7B-Prompt-Latency.png b/src/images/blogs/accelerating-llama-2/Figure2-LLaMA-2-7B-Prompt-Latency.png
new file mode 100644
index 0000000000000..996796482ba2f
Binary files /dev/null and b/src/images/blogs/accelerating-llama-2/Figure2-LLaMA-2-7B-Prompt-Latency.png differ
diff --git a/src/images/blogs/accelerating-llama-2/Figure3-LLaMA-2-13B-Tokens-Generated-Throughput.png b/src/images/blogs/accelerating-llama-2/Figure3-LLaMA-2-13B-Tokens-Generated-Throughput.png
index b5ea1e1c5f918..ae1771bd5e13e 100644
Binary files a/src/images/blogs/accelerating-llama-2/Figure3-LLaMA-2-13B-Tokens-Generated-Throughput.png and b/src/images/blogs/accelerating-llama-2/Figure3-LLaMA-2-13B-Tokens-Generated-Throughput.png differ
diff --git a/src/images/blogs/accelerating-llama-2/Figure3-LLaMA-2-7B-Tokens-Generated-Throughput.png b/src/images/blogs/accelerating-llama-2/Figure3-LLaMA-2-7B-Tokens-Generated-Throughput.png
index 0bc90a464bc71..72dd1f6995cea 100644
Binary files a/src/images/blogs/accelerating-llama-2/Figure3-LLaMA-2-7B-Tokens-Generated-Throughput.png and b/src/images/blogs/accelerating-llama-2/Figure3-LLaMA-2-7B-Tokens-Generated-Throughput.png differ
diff --git a/src/routes/blogs/accelerating-llama-2/+page.svelte b/src/routes/blogs/accelerating-llama-2/+page.svelte
index 0f5add02a8d4b..3164ebbf2c007 100644
--- a/src/routes/blogs/accelerating-llama-2/+page.svelte
+++ b/src/routes/blogs/accelerating-llama-2/+page.svelte
@@ -1,13 +1,13 @@