From 736ed7aeb7ed0eaaec20bc4bc19e599f1606c654 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Thu, 13 Jun 2024 15:25:44 -0700 Subject: [PATCH 1/2] [Doc] Fix links in Device Tensor Doc (#21039) --- docs/performance/device-tensor.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/performance/device-tensor.md b/docs/performance/device-tensor.md index 0ddcd8457f1ef..839258a047770 100644 --- a/docs/performance/device-tensor.md +++ b/docs/performance/device-tensor.md @@ -8,7 +8,7 @@ nav_order: 6 Using device tensors can be a crucial part in building efficient AI pipelines, especially on heterogenous memory systems. A typical example of such systems is any PC with a dedicated GPU. -While a [recent GPU](https://www.techpowerup.com/gpu-specs/geforce-rtx-4090.c3889) itself has a memory bandwidth of about 1TB/s, the interconnect [PCI 4.0 x16](https://de.wikipedia.org/wiki/PCI_Express) to the CPU can often be the limiting factor with only ~32GB/s. +While a [recent GPU](https://www.techpowerup.com/gpu-specs/geforce-rtx-4090.c3889) itself has a memory bandwidth of about 1TB/s, the interconnect [PCI 4.0 x16](https://en.wikipedia.org/wiki/PCI_Express) to the CPU can often be the limiting factor with only ~32GB/s. Therefore it is often best to keep data local to the GPU as much as possible or hide slow memory traffic behind computation as the GPU is able to execute compute and PCI memory traffic simultaneously. A typical use case for these scenarios where memory is already local to the inference device would be a GPU accelerated video processing of an encoded video stream which can be decoded with GPU decoders. @@ -20,7 +20,7 @@ Tile based inference for high resolution images is another use-case where custom ## CUDA CUDA in ONNX Runtime has two custom memory types. -`"CudaPinned"` and `"Cuda"` memory where [CUDA pinned](https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-cc/) is actually CPU memory which is directly accesible by the GPU allowing for fully asynchronous up and download of memory using [`cudaMemcpyAsync`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79). +`"CudaPinned"` and `"Cuda"` memory where [CUDA pinned](https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-cc/) is actually CPU memory which is directly accessible by the GPU allowing for fully asynchronous up and download of memory using [`cudaMemcpyAsync`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g85073372f776b4c4d5f89f7124b7bf79). Normal CPU tensors only allow for a synchronous downloads from GPU to CPU while CPU to GPU copies can always be executed asynchronous. Allocating a tensor using the `Ort::Sessions`'s allocator is very straight forward using the [C++ API](https://onnxruntime.ai/docs/api/c/struct_ort_1_1_value.html#a5d35080239ae47cdbc9e505666dc32ec) which directly maps to the C API. @@ -51,7 +51,7 @@ auto ort_value = Ort::Value::CreateTensor( These allocated tensors can then be used as [I/O Binding](../performance/tune-performance/iobinding.md) to eliminate copy ops on the network and move the responsibility to the user. 
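For illustration (not part of the patched document), here is a minimal Python sketch of the same I/O-binding idea, assuming a CUDA-enabled onnxruntime build and a model with a single input named "input" and a single output named "output"; the model path, names, and shapes are placeholders.

```python
# Hedged sketch: keep input and output tensors resident on the GPU and bind
# them once, so run_with_iobinding() triggers no host/device copies per call.
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("model.onnx", providers=["CUDAExecutionProvider"])

# Allocate both tensors directly in CUDA device memory (device_id 0).
x_gpu = ort.OrtValue.ortvalue_from_numpy(
    np.zeros((1, 3, 224, 224), dtype=np.float32), "cuda", 0)
y_gpu = ort.OrtValue.ortvalue_from_shape_and_type((1, 1000), np.float32, "cuda", 0)

binding = session.io_binding()
binding.bind_ortvalue_input("input", x_gpu)
binding.bind_ortvalue_output("output", y_gpu)

# Per inference call only the kernel launches happen; data stays on the device.
session.run_with_iobinding(binding)
result = y_gpu.numpy()  # explicit download, only when the host really needs it
```

The host is only involved when `numpy()` is called to download the result; everything else stays in device memory.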
With such IO bindings more performance tunings are possible: - due to the fixed tensor address, a CUDA graph can be captured to reduce CUDA launch latency on CPU -- due to either having fully asynchronous downloads to pinned memory or eliminating memory copies by using device local tensor, CUDA can run [fully asynchronous via a run option](../execution-providers/CUDA-ExecutionProvider.md#performance-Tuning) on its given stream +- due to either having fully asynchronous downloads to pinned memory or eliminating memory copies by using device local tensor, CUDA can run [fully asynchronous via a run option](../execution-providers/CUDA-ExecutionProvider.md#performance-tuning) on its given stream. To set the custom compute stream for CUDA, refer to the V2 option API exposing the `Ort[CUDA|TensorRT]ProviderOptionsV2*`opaque struct pointer and the function `Update[CUDA|TensorRT]ProviderOptionsWithValue(options, "user_compute_stream", cuda_stream);` to set it's stream member. More details can be found in each execution provider doc. @@ -132,5 +132,4 @@ binding.bind_output("out", "dml") # binding.bind_ortvalue_output("out", dml_array_out) session.run_with_iobinding(binding) - ``` \ No newline at end of file From 82bb41a9f63f0ad50b481cdeb13c4212828ce659 Mon Sep 17 00:00:00 2001 From: Maanav Dalal Date: Thu, 13 Jun 2024 16:01:09 -0700 Subject: [PATCH 2/2] Fixed many accessibility issues. (#20990) ### Description Resolving or re-resolving: #20096 , #20118 , #20153 , #20151 , #20152 Done by: - Adding skip to main content links - Adding pause carousel - Fixing H1s - Fixing color scheme - Making code blocks appear on mobile / smaller viewports. ### For testing Please test using mobile and desktop versions, ensuring everything (especially the landing page) appear as expected! --- .../InfiniteMovingCards.svelte | 17 ++++++++- src/routes/+layout.svelte | 2 +- .../blogs/accelerating-llama-2/+page.svelte | 38 +++++++++---------- src/routes/blogs/blog-post-featured.svelte | 2 +- src/routes/blogs/blog-post.svelte | 2 +- src/routes/blogs/post.svelte | 2 +- .../blogs/pytorch-on-the-edge/+page.svelte | 30 +++++++-------- src/routes/components/code-blocks.svelte | 38 ++++++++++++------- src/routes/components/customers.svelte | 2 +- src/routes/components/footer.svelte | 4 +- src/routes/components/header.svelte | 4 +- src/routes/components/hero.svelte | 2 +- src/routes/components/performance.svelte | 2 +- .../components/training-and-inference.svelte | 4 +- src/routes/components/winarm.svelte | 18 ++++----- src/routes/events/event-post.svelte | 2 +- src/routes/getting-started/+page.svelte | 6 +-- src/routes/huggingface/+page.svelte | 30 +++++++-------- src/routes/inference/+page.svelte | 2 +- src/routes/onnx/+page.svelte | 2 +- .../testimonials/testimonial-card.svelte | 2 +- src/routes/training/+page.svelte | 14 +++---- src/routes/windows/+page.svelte | 6 +-- 23 files changed, 130 insertions(+), 101 deletions(-) diff --git a/src/lib/components/ui/InfiniteMovingCards/InfiniteMovingCards.svelte b/src/lib/components/ui/InfiniteMovingCards/InfiniteMovingCards.svelte index ddb822eb9454c..6c56315258661 100644 --- a/src/lib/components/ui/InfiniteMovingCards/InfiniteMovingCards.svelte +++ b/src/lib/components/ui/InfiniteMovingCards/InfiniteMovingCards.svelte @@ -57,14 +57,29 @@ } } }; + + const toggleScroll = () => { + if (scrollerRef) { + const currentState = window.getComputedStyle(scrollerRef).animationPlayState; + scrollerRef.style.animationPlayState = currentState === 'running' ? 
'paused' : 'running'; + } + }; + + const handleKeyDown = (event: { key: string; preventDefault: () => void; }) => { + if (event.key === 'Enter' || event.key === ' ') { + event.preventDefault(); // Prevent default spacebar scrolling behavior + toggleScroll(); + } + };
+
    diff --git a/src/routes/+layout.svelte b/src/routes/+layout.svelte index 12545e5526ad9..c6c538bc4e536 100644 --- a/src/routes/+layout.svelte +++ b/src/routes/+layout.svelte @@ -46,7 +46,7 @@
    {/if} {#key data.pathname} -
    +
    {/key} diff --git a/src/routes/blogs/accelerating-llama-2/+page.svelte b/src/routes/blogs/accelerating-llama-2/+page.svelte index c16eeeb5dcafb..5854bfcb489e8 100644 --- a/src/routes/blogs/accelerating-llama-2/+page.svelte +++ b/src/routes/blogs/accelerating-llama-2/+page.svelte @@ -45,11 +45,11 @@

    Accelerating LLaMA-2 Inference with ONNX Runtime

    - By: Kunal Vaishnavi and - Parinita Rahi + Parinita Rahi

    14TH NOVEMBER, 2023 (Updated 22nd November) @@ -70,13 +70,13 @@ quantization updates, and cross-platform usage scenarios.

    -

    Background: Llama2 and Microsoft

    +

    Background: Llama2 and Microsoft

    Llama2 is a state-of-the-art open source LLM from Meta ranging in scale from 7B to 70B parameters (7B, 13B, 70B). Microsoft and Meta announced their AI on Azure and Windows collaboration in July 2023. As part of the announcement, Llama2 was added to the Azure AI model catalog, which serves as a hub of foundation models that empower developers and machine learning (ML) professionals to easily discover, evaluate, customize, and @@ -89,7 +89,7 @@ your costs.

    -

    +

    Faster Inferencing with New ONNX Runtime Optimizations

    @@ -115,7 +115,7 @@
    Figure 1: E2E Throughput Comparisons
    -

    Latency and Throughput

    +

    Latency and Throughput

    The graphs below show latency comparisons between the ONNX Runtime and PyTorch variants of the @@ -152,11 +152,11 @@

    More details on these metrics can be found here.

    -

    ONNX Runtime with Multi-GPU Inference

    +

    ONNX Runtime with Multi-GPU Inference

    ONNX Runtime supports multi-GPU inference to enable serving large models. Even in FP16 @@ -165,7 +165,7 @@

    - ONNX Runtime applied Megatron-LM Tensor Parallelism on the 70B model to split the original model weight onto different GPUs. Megatron @@ -176,7 +176,7 @@ You can find additional example scripts here.
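As a rough illustration of the idea (this is not ONNX Runtime's or Megatron-LM's actual code), a single linear layer can be split column-wise across devices and the partial results gathered back together:

```python
# Illustrative sketch of column-parallel tensor parallelism with plain numpy;
# real implementations shard across GPUs and use an NCCL all-gather instead.
import numpy as np

num_ranks = 4
x = np.random.randn(2, 4096).astype(np.float32)      # activations, replicated on every rank
w = np.random.randn(4096, 11008).astype(np.float32)  # full weight, too big for one device

shards = np.split(w, num_ranks, axis=1)               # each rank stores one column slice
partials = [x @ w_shard for w_shard in shards]        # each rank computes its slice locally
y = np.concatenate(partials, axis=1)                  # "all-gather" restores the full output

assert np.allclose(y, x @ w, atol=1e-3)
```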

    @@ -185,7 +185,7 @@
    Figure 4: 70B Llama2 Model Throughput
    -

    ONNX Runtime Optimizations

    +

    ONNX Runtime Optimizations

    LLaMA-2 Optimization Diagram
    Figure 5: LLaMA-2 Optimization Diagram
    @@ -252,7 +252,7 @@ calculate the rotary embeddings more efficiently with less memory usage. The rotary embedding compute kernels also support interleaved and non-interleaved formats to support both the Microsoft version of LLaMA-2 and the Hugging Face version of LLaMA-2 respectively while sharing the same calculations.
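For intuition only, a small numpy sketch of the two rotary-embedding layouts (the fused ONNX Runtime kernels are implemented differently; the head dimension and position below are arbitrary):

```python
# Illustrative rotary position embedding (RoPE) for one head of dimension d,
# supporting both the interleaved and the non-interleaved pairing of channels.
import numpy as np

def rope(x, pos, interleaved, base=10000.0):
    d = x.shape[-1]
    inv_freq = base ** (-np.arange(0, d, 2) / d)   # one rotation frequency per pair
    angles = pos * inv_freq
    cos, sin = np.cos(angles), np.sin(angles)
    if interleaved:                                # pairs are (x0, x1), (x2, x3), ...
        x1, x2 = x[..., 0::2], x[..., 1::2]
    else:                                          # pairs are (x_i, x_{i + d/2})
        x1, x2 = x[..., : d // 2], x[..., d // 2:]
    r1 = x1 * cos - x2 * sin                       # same 2D rotation in both layouts
    r2 = x1 * sin + x2 * cos
    if interleaved:
        out = np.empty_like(x)
        out[..., 0::2], out[..., 1::2] = r1, r2
        return out
    return np.concatenate([r1, r2], axis=-1)

q = np.random.randn(128).astype(np.float32)        # one head, dimension 128
print(rope(q, pos=5, interleaved=True)[:4])
print(rope(q, pos=5, interleaved=False)[:4])
```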

    @@ -260,16 +260,16 @@

    The optimizations work for the Hugging Face versions (models ending with -hf) and the Microsoft versions. You can download the optimized HF versions from - Microsoft's LLaMA-2 ONNX repository. Stay tuned for newer Microsoft versions coming soon!

    -

    Optimize your own model using Olive

    +

    Optimize your own model using Olive

    Olive is a hardware-aware model optimization tool that incorporates advanced techniques such @@ -281,7 +281,7 @@

    Here is an example of Llama2 optimization with Olive, which harnesses ONNX Runtime optimizations highlighted in this blog. Distinct optimization flows cater to various requirements. For instance, you have the flexibility to choose different data types for quantization in CPU and GPU inference, based on your accuracy @@ -289,17 +289,17 @@ GPUs and perform inference with ONNX Runtime optimizations.

    -

    Usage Example

    +

    Usage Example

    Here is a sample notebook that shows you an end-to-end example of how you can use the above ONNX Runtime optimizations in your application.

    -

    Conclusion

    +

    Conclusion

    The advancements discussed in this blog provide faster Llama2 inferencing with ONNX Runtime, diff --git a/src/routes/blogs/blog-post-featured.svelte b/src/routes/blogs/blog-post-featured.svelte index b514fe4b29c9d..15d82a5164860 100644 --- a/src/routes/blogs/blog-post-featured.svelte +++ b/src/routes/blogs/blog-post-featured.svelte @@ -33,7 +33,7 @@

    {title}

    {description}

    {imgalt} -
    +
    {date}
    diff --git a/src/routes/blogs/blog-post.svelte b/src/routes/blogs/blog-post.svelte index a661253f59672..dfc303ae6cb1e 100644 --- a/src/routes/blogs/blog-post.svelte +++ b/src/routes/blogs/blog-post.svelte @@ -30,7 +30,7 @@

    {title}

    {description}

    -

    +

    {date}

    diff --git a/src/routes/blogs/post.svelte b/src/routes/blogs/post.svelte index 1b024eb5b2e40..73f248e7977c4 100644 --- a/src/routes/blogs/post.svelte +++ b/src/routes/blogs/post.svelte @@ -82,7 +82,7 @@

    By:

    {/if} {#each authors as author, i} - {author}{i + 1 === authors.length + {author}{i + 1 === authors.length ? '' : ', '} {/each} diff --git a/src/routes/blogs/pytorch-on-the-edge/+page.svelte b/src/routes/blogs/pytorch-on-the-edge/+page.svelte index 6d7f950f513a6..83ab6d2d49db6 100644 --- a/src/routes/blogs/pytorch-on-the-edge/+page.svelte +++ b/src/routes/blogs/pytorch-on-the-edge/+page.svelte @@ -179,9 +179,9 @@ fun run(audioTensor: OnnxTensor): Result {

    Run PyTorch models on the edge

    - By: Natalie Kershaw + By: Natalie Kershaw and - Prasanth Pulavarthi

    @@ -217,12 +217,12 @@ fun run(audioTensor: OnnxTensor): Result { anywhere that is outside of the cloud, ranging from large, well-resourced personal computers to small footprint devices such as mobile phones. This has been a challenging task to accomplish in the past, but new advances in model optimization and software like - ONNX Runtime + ONNX Runtime make it more feasible - even for new generative AI and large language models like Stable Diffusion, Whisper, and Llama2.

    -

    Considerations for PyTorch models on the edge

    +

    Considerations for PyTorch models on the edge

    There are several factors to keep in mind when thinking about running a PyTorch model on the @@ -292,7 +292,7 @@ fun run(audioTensor: OnnxTensor): Result {

-

Tools for PyTorch models on the edge

+

Tools for PyTorch models on the edge

We mentioned ONNX Runtime several times above. ONNX Runtime is a compact, standards-based @@ -305,7 +305,7 @@ format that doesn't require the PyTorch framework and its gigabytes of dependencies. PyTorch has thought about this and includes an API that enables exactly this - torch.onnx. ONNX is an open standard that defines the operators that make up models. The PyTorch ONNX APIs take the Pythonic PyTorch code and turn it into a functional graph that captures the operators that are needed to run the model without Python. As with everything @@ -318,7 +318,7 @@ The popular Hugging Face library also has APIs that build on top of this torch.onnx functionality to export models to the ONNX format. Over 130,000 models are supported making it very likely that the model you care about is one of them.
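As a minimal sketch of that export path (the module, file name, and tensor names below are stand-ins, not taken from the original post):

```python
# Export a small PyTorch module to ONNX so it can run without the PyTorch runtime.
import torch

class TinyClassifier(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(32, 64), torch.nn.ReLU(), torch.nn.Linear(64, 10))

    def forward(self, x):
        return self.net(x)

model = TinyClassifier().eval()
example_input = torch.randn(1, 32)

torch.onnx.export(
    model,
    example_input,
    "tiny_classifier.onnx",
    input_names=["input"],
    output_names=["logits"],
    dynamic_axes={"input": {0: "batch"}, "logits": {0: "batch"}},  # variable batch size
)
```

The exported file can then be loaded with onnxruntime's InferenceSession on the target device, with no PyTorch dependency.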

@@ -328,7 +328,7 @@ fun run(audioTensor: OnnxTensor): Result { and web browsers) via various languages (from C# to JavaScript to Swift).

-

Examples of PyTorch models on the edge

+

Examples of PyTorch models on the edge

Stable Diffusion on Windows

@@ -345,7 +345,7 @@ fun run(audioTensor: OnnxTensor): Result {

You don't have to export the fifth model, ClipTokenizer, as it is available in ONNX Runtime extensions, a library for pre- and post-processing PyTorch models.

@@ -366,7 +366,7 @@ fun run(audioTensor: OnnxTensor): Result {

You can build the application and run it on Windows with the detailed steps shown in this tutorial.

@@ -374,7 +374,7 @@ fun run(audioTensor: OnnxTensor): Result {

Running a PyTorch model locally in the browser is not only possible but super simple with - the transformers.js library. Transformers.js uses ONNX Runtime Web as its backend. Many models are already converted to ONNX and served by the transformers.js CDN, making inference in the browser a matter of writing @@ -407,7 +407,7 @@ fun run(audioTensor: OnnxTensor): Result { All components of the Whisper Tiny model (audio decoder, encoder, decoder, and text sequence generation) can be composed and exported to a single ONNX model using the Olive framework. To run this model as part of a mobile application, you can use ONNX Runtime Mobile, which supports Android, iOS, react-native, and MAUI/Xamarin.

@@ -420,7 +420,7 @@ fun run(audioTensor: OnnxTensor): Result {

The relevant snippet of an example Android mobile app that performs speech transcription on short samples of audio is shown below:

@@ -476,11 +476,11 @@ fun run(audioTensor: OnnxTensor): Result {

You can read the full Speaker Verification tutorial, and build and run the application from source.

diff --git a/src/routes/components/code-blocks.svelte b/src/routes/components/code-blocks.svelte index 531d8b9a6ff75..fd51f4292e2dd 100644 --- a/src/routes/components/code-blocks.svelte +++ b/src/routes/components/code-blocks.svelte @@ -7,9 +7,11 @@ import cpp from 'svelte-highlight/languages/cpp'; import FaLink from 'svelte-icons/fa/FaLink.svelte'; import { blur, fade } from 'svelte/transition'; + import { d } from 'svelte-highlight/languages'; + import github from "svelte-highlight/styles/github"; let pythonCode = - 'import onnxruntime as ort\n# Load the model and create InferenceSession\nmodel_path = "path/to/your/onnx/model"\nsession = ort.InferenceSession(model_path)\n# Load and preprocess the input image inputTensor\n...\n# Run inference\noutputs = session.run(None {"input": inputTensor})\nprint(outputs)'; + 'import onnxruntime as ort\n# Load the model and create InferenceSession\nmodel_path = "path/to/your/onnx/model"\nsession = ort.InferenceSession(model_path)\n# "Load and preprocess the input image inputTensor"\n...\n# Run inference\noutputs = session.run(None {"input": inputTensor})\nprint(outputs)'; let csharpCode = 'using Microsoft.ML.OnnxRuntime;\n// Load the model and create InferenceSession\nstring model_path = "path/to/your/onnx/model";\nvar session = new InferenceSession(model_path);\n// Load and preprocess the input image to inputTensor\n...\n// Run inference\nvar outputs = session.Run(inputTensor).ToList();\nConsole.WriteLine(outputs[0].AsTensor()[0]);'; let javascriptCode = @@ -45,30 +47,40 @@ activeTab = tabText; activeTab = activeTab; }; - + let innerWidth = 0 + + + + {@html github} +

Use ONNX Runtime with your favorite language and get started with the tutorials:

-