diff --git a/src/images/blogs/ort-1-17-thumbnail.png b/src/images/blogs/ort-1-17-thumbnail.png new file mode 100644 index 0000000000000..e80f7703384f3 Binary files /dev/null and b/src/images/blogs/ort-1-17-thumbnail.png differ diff --git a/src/routes/blogs/+page.svelte b/src/routes/blogs/+page.svelte index 911df71b788c7..5a5d7cbed0e46 100644 --- a/src/routes/blogs/+page.svelte +++ b/src/routes/blogs/+page.svelte @@ -11,6 +11,7 @@ import SDXLTurboImage from '../../images/blogs/sdxl_blog_thumbnail.png'; import Phi2Image from '../../routes/blogs/accelerating-phi-2/Phi2_Int4_TokenGenerationTP.png'; import { createEventDispatcher } from 'svelte'; + import ORT117Thumbnail from '../../images/blogs/ort-1-17-thumbnail.png'; import WebTrainingImage from '../../images/blogs/webtraining_blog_thumbnail.png'; onMount(() => { anime({ @@ -39,6 +40,15 @@ dispatch('switchTab', tab); } let featuredblog = [ + { + title: 'ONNX Runtime 1.17: CUDA 12 support, Phi-2 optimizations, WebGPU, and more!', + date: 'February 28th, 2024', + blurb: + 'From Phi-2 model optimizations to CUDA 12 support, read this post to learn more about some of the exciting new functionality introduced in the ONNX Runtime 1.17 release.', + link: 'blogs/ort-1-17-release', + image: ORT117Thumbnail, + imgalt: 'ONNX Runtime 1.17 release logo' + }, { title: 'Accelerating Phi-2, CodeLlama, Gemma and other Gen AI models with ONNX Runtime', date: 'February 26th, 2024', @@ -56,7 +66,9 @@ link: 'https://cloudblogs.microsoft.com/opensource/2024/02/06/on-device-training-training-a-model-in-browser', image: WebTrainingImage, imgalt: 'Components of the onnxruntime-web JS package' - }, + } + ]; + let blogs = [ { title: 'Accelerating SD Turbo and SDXL Turbo Inference with ONNX Runtime and Olive', date: 'January 15th, 2024', @@ -66,8 +78,6 @@ image: SDXLTurboImage, imgalt: 'SD Turbo and SDXL Turbo models with ONNX Runtime and Olive' }, - ]; - let blogs = [ { title: 'Accelerating LLaMA-2 Inference with ONNX Runtime', date: 'November 
14th, 2023', diff --git a/src/routes/blogs/ort-1-17-release/+page.svx b/src/routes/blogs/ort-1-17-release/+page.svx index f8fbaedfb6bd9..fd56326557bb4 100644 --- a/src/routes/blogs/ort-1-17-release/+page.svx +++ b/src/routes/blogs/ort-1-17-release/+page.svx @@ -1,6 +1,6 @@ --- -title: Unlock new functionality with ONNX Runtime 1.17 -date: '2024-02-26' +title: 'ONNX Runtime 1.17: CUDA 12 support, Phi-2 optimizations, WebGPU, and more!' +date: '28th February, 2024' description: 'From Phi-2 model optimizations to CUDA 12 support, read this post to learn more about some of the exciting new functionality introduced in the ONNX Runtime 1.17 release.' keywords: 'ORT, ONNX Runtime, ONNX, machine learning, deep learning, model optimization, Phi-2, Mistral, CodeLlama, SDXL-Turbo, on-device training, DirectML, NPU, WebGPU, Yolov8, pose detection, CUDA 12, GPU, Windows, browser, WebGPU, DirectML, NPU, Phi-2, Mistral, CodeLlama, SDXL-Turbo, on-device training, DirectML, NPU, WebGPU, Yolov8, pose detection, CUDA 12, GPU, Windows, browser, WebGPU, DirectML, NPU, Phi-2, Mistral, CodeLlama, SDXL-Turbo, on-device training, DirectML, NPU, WebGPU, Yolov8, pose detection, CUDA 12, GPU, Windows, browser, WebGPU, DirectML, NPU, Phi-2, Mistral, CodeLlama, SDXL-Turbo, on-device training, DirectML, NPU, WebGPU, Yolov8, pose detection, CUDA 12, GPU, Windows, browser, WebGPU, DirectML, NPU, Phi-2, Mistral, CodeLlama, SDXL-Turbo, on-device training, DirectML, NPU, WebGPU, Yolov8, pose detection, CUDA 12, GPU, Windows, browser, WebGPU, DirectML, NPU, Phi-2, Mistral, CodeLlama, SDXL-Turbo, on-device training, DirectML, NPU, WebGPU, Yolov8, pose detection, CUDA 12, GPU, Windows, browser, WebGPU, DirectML, NPU, Phi-2, Mistral, CodeLlama, SDXL-Turbo, on-device training, DirectML, NPU, WebGPU, Yolov8, pose detection, CUDA 12, GPU, Windows, browser, WebGPU, DirectML, NPU, Phi-2, Mistral, CodeLlama, SDXL-Turbo, on-device training, DirectML, NPU, WebGPU, Yolov8, pose detection, CUDA 12, GPU, Windows, 
browser, WebGPU, DirectML, NPU, Phi-2, Mistral, CodeLlama, SDXL-Turbo, on-device training, DirectML, NPU, WebGPU, Yolov8' authors: @@ -8,6 +8,8 @@ authors: 'Sophie Schoenmeyer', 'Parinita Rahi', 'Kshama Pawar', + 'Caroline Zhu', + 'Chad Pralle', 'Emma Ning', 'Natalie Kershaw', 'Jian Chen' @@ -15,8 +17,10 @@ authors: authorsLink: [ 'https://www.linkedin.com/in/sophieschoenmeyer/', - https://www.linkedin.com/in/parinitaparinita/, - 'https://www.linkedin.com/in/kshama-pawar', + 'https://www.linkedin.com/in/parinitaparinita/', + 'https://www.linkedin.com/in/kshama-pawar/', + 'https://www.linkedin.com/in/carzh/', + 'https://www.linkedin.com/in/chadpralle/', '', 'https://www.linkedin.com/in/natkershaw/', '' @@ -25,47 +29,57 @@ image: '' url: 'https://onnxruntime.ai/blogs/ort-1_17-release-blog' --- -# ONNX Runtime 1.17 Release Blog - Recently, we released ONNX Runtime 1.17, which includes a host of new features to further streamline the process of inferencing and training machine learning models across various platforms faster than ever. The release includes improvements to some of our existing features, along with exciting new features like Phi-2 optimizations, training a model in-browser with on-device training, ONNX Runtime Web with WebGPU, and more. -For a complete list of new features, along with various assets, check out the release on GitHub: [ONNX Runtime v1.17.0](https://github.com/microsoft/onnxruntime/releases/tag/v1.17.0). +For a complete list of new features, along with various assets, check out the [1.17 release](https://github.com/microsoft/onnxruntime/releases/tag/v1.17.0) and our recent [1.17.1 patch release](https://github.com/microsoft/onnxruntime/releases/tag/v1.17.1) on GitHub. 
+ +# Model Optimization + +The ONNX Runtime (ORT) 1.17 release provides improved inference performance for several models, such as [Phi-2](https://huggingface.co/microsoft/phi-2), [Mistral](https://huggingface.co/mistralai/Mistral-7B-v0.1), [CodeLlama](https://huggingface.co/codellama), [Google's Gemma](https://huggingface.co/google/gemma-7b), [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo), and more by using state-of-the-art fusion and kernel optimizations and including support for float16 and int4 quantization. The specific ORT optimizations added in this release are Attention, Multi-Head Attention, Grouped-Query Attention, and Rotary Embedding ORT kernel changes. ORT outperforms other frameworks like PyTorch, DeepSpeed, and Llama.cpp in terms of prompt and token generation throughput, with speedups as high as **20x**. In particular, we observe performance improvements as high as **20.5x for Phi-2**, **16.0x for Orca-2**, and **19.8x for Gemma** (see linked blog below for additional details for each model). ONNX Runtime with int4 quantization performs best with batch size 1 due to a special GemV kernel implementation. Overall, ONNX Runtime demonstrates significant performance gains across several batch sizes and prompt lengths. -## Models Optimization +ONNX Runtime also shows significant benefits for training LLMs, and these gains typically increase with batch size. For example, ORT is **1.2x faster** than PyTorch Eager mode and **1.5x faster** than torch.compile for Phi-2 with LoRA on 2 A100 GPUs. ORT also shows benefits for other LLMs, like Llama, Mistral, and Orca-2, with combinations of LoRA or QLoRA. 
-The ONNX Runtime (ORT) 1.17 release provides improved inference performance for several models, such as [Phi-2](https://huggingface.co/microsoft/phi-2), [Mistral](https://huggingface.co/mistralai/Mistral-7B-v0.1), [CodeLlama](https://huggingface.co/codellama), and [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo), by using state-of-the-art fusion and kernel optimizations and including support for Float16 and Int4 quantization. The specific ORT optimizations added in this release are Attention, Multi-Head Attention, Grouped-Query Attention, and Rotary Embedding ORT kernel changes. ORT outperforms other frameworks like PyTorch, DeepSpeed, and Llama.cpp in terms of prompt and token generation throughput, with speedups as high as **18.55x** for **Phi-2** with Float16, **20.48x** for Phi-2 with Int4, and **4.1x** for Mistral with Float16 (see linked blogs below for additional details). +To read more about improving generative AI model performance with ONNX Runtime 1.17, check out our recent post on the ONNX Runtime blog: [Accelerating Phi-2, CodeLlama, Gemma and other Gen AI models with ONNX Runtime](https://onnxruntime.ai/blogs/accelerating-phi-2). -ONNX Runtime also shows significant benefits for training LLMs, and these gains typically increase with batch size. For example, ORT is 1.2x faster than PyTorch Eager mode and 1.5x faster than torch.compile for Phi-2 with LoRA on 2 A100 GPUs. ORT also shows benefits for other LLMs, like Llama, Mistral, and Orca-2, with combinations of LoRA or QLoRA. +
+Phi-2 int4 token generation throughput comparison -To read more about accelerating Phi-2, Mistral, CodeLlama, SDXL-Turbo, and more with ONNX Runtime 1.17, check out this recent post on the ONNX Runtime blog: **_Phi-2 newsletter link_**. +Orca-2 int4 token generation throughput comparison -## On-Device Training +CodeLlama int4 token generation throughput comparison + +Gemma int4 token generation throughput comparison +
+ +# In-Browser Training On-device training allows you to improve the user experience for developer applications using device data. It supports scenarios like federated learning, which trains a global model using data on the device. With the 1.17 release, ORT will now enable training machine learning models in the browser using on-device training. To learn more about training a model in browser with on-device training, check out this recent post on the Microsoft Open Source Blog: [On-Device Training: Training a model in browser](https://cloudblogs.microsoft.com/opensource/2024/02/06/on-device-training-training-a-model-in-browser/). -## DirectML NPU Support +Diagram of the components of the onnxruntime-web JS package + +# DirectML NPU Support With the release of [DirectML 1.13.1](https://github.com/microsoft/DirectML/blob/master/Releases.md) and ONNX Runtime 1.17, developer preview support for neural processing unit (NPU) acceleration is now available in DirectML, the machine learning platform API for Windows. This developer preview enables support for a subset of models on new Windows 11 devices with Intel® Core™ Ultra processors with Intel® AI boost. To learn more about NPU support in DirectML, check out this recent post on the Windows Developer Blog: [Introducing Neural Processor Unit (NPU) support in DirectML (developer preview)](https://blogs.windows.com/windowsdeveloper/2024/02/01/introducing-neural-processor-unit-npu-support-in-directml-developer-preview/). -## ONNX Runtime Web with WebGPU +# WebGPU with ONNX Runtime Web WebGPU enables web developers to harness GPU hardware for high-performance computations. The ONNX Runtime 1.17 release introduces the official launch of the WebGPU execution provider in ONNX Runtime Web, allowing sophisticated models to run entirely and efficiently within the browser (see the [list of WebGPU browser compatibility](https://github.com/gpuweb/gpuweb/wiki/Implementation-Status)). 
This advancement, demonstrated by the effective execution of models such as SD-Turbo, unlocks new possibilities in scenarios where CPU-based in-browser machine learning faces challenges in meeting performance standards. To learn more about how ONNX Runtime Web further accelerates in-browser machine learning with WebGPU, stay tuned for our upcoming blog post. -## Yolov8 Pose Detection Scenario +# YOLOv8 Pose Estimation Scenario with ONNX Runtime Mobile -This release adds support for running the Yolov8 model for pose detection. Pose detection involves processing the objects detected in an image and identifying the position and orientation of people in the image. The core Yolov8 model returns a set of key points, representing specific parts of the detected person's body, such as joints and other distinctive features. Including the pre- and post-processing in the ONNX model allows developers to supply an input image directly, either in common image formats or raw RGB values, and output the image with bounding boxes and key points. +This release adds support for running the YOLOv8 model for pose estimation. Pose estimation involves processing the objects detected in an image and identifying the position and orientation of people in the image. The core YOLOv8 model returns a set of key points, representing specific parts of the detected person's body, such as joints and other distinctive features. Including the pre- and post-processing in the ONNX model allows developers to supply an input image directly, either in common image formats or raw RGB values, and output the image with bounding boxes and key points. -**_TODO: Add output image_** +To learn more about how to build and run ONNX models on mobile with built-in pre- and post-processing for object detection and pose estimation, check out our recent tutorial in the ONNX Runtime documentation: [Object detection and pose estimation with YOLOv8](https://onnxruntime.ai/docs/tutorials/mobile/pose-detection.html). 
-**_TODO: Add link_** +Person with pose drawn -## CUDA 12 packages - Jian +# CUDA 12 Packages As part of the 1.17 release, ONNX Runtime now ensures compatibility across multiple versions of Nvidia's CUDA execution provider by introducing CUDA 12 packages for Python and NuGet. With this more flexible methodology, users will now have access to both CUDA 11 and CUDA 12, allowing for more seamless integration of cutting-edge hardware acceleration technologies. diff --git a/src/routes/blogs/ort-1-17-release/in_browser_training.jpg b/src/routes/blogs/ort-1-17-release/in_browser_training.jpg new file mode 100644 index 0000000000000..da5942daebb49 Binary files /dev/null and b/src/routes/blogs/ort-1-17-release/in_browser_training.jpg differ diff --git a/src/routes/blogs/ort-1-17-release/model_optimization_chart.jpg b/src/routes/blogs/ort-1-17-release/model_optimization_chart.jpg new file mode 100644 index 0000000000000..be5f30005df6a Binary files /dev/null and b/src/routes/blogs/ort-1-17-release/model_optimization_chart.jpg differ diff --git a/src/routes/blogs/ort-1-17-release/yolov8_pose_estimation.png b/src/routes/blogs/ort-1-17-release/yolov8_pose_estimation.png new file mode 100644 index 0000000000000..0e9473489bf9a Binary files /dev/null and b/src/routes/blogs/ort-1-17-release/yolov8_pose_estimation.png differ