diff --git a/src/images/blogs/ort-1-17-thumbnail.png b/src/images/blogs/ort-1-17-thumbnail.png
new file mode 100644
index 0000000000000..e80f7703384f3
Binary files /dev/null and b/src/images/blogs/ort-1-17-thumbnail.png differ
diff --git a/src/routes/blogs/+page.svelte b/src/routes/blogs/+page.svelte
index 911df71b788c7..5a5d7cbed0e46 100644
--- a/src/routes/blogs/+page.svelte
+++ b/src/routes/blogs/+page.svelte
@@ -11,6 +11,7 @@
   import SDXLTurboImage from '../../images/blogs/sdxl_blog_thumbnail.png';
   import Phi2Image from '../../routes/blogs/accelerating-phi-2/Phi2_Int4_TokenGenerationTP.png';
   import { createEventDispatcher } from 'svelte';
+  import ORT117Thumbnail from '../../images/blogs/ort-1-17-thumbnail.png';
   import WebTrainingImage from '../../images/blogs/webtraining_blog_thumbnail.png';
   onMount(() => {
     anime({
@@ -39,6 +40,15 @@
     dispatch('switchTab', tab);
   }
   let featuredblog = [
+    {
+      title: 'ONNX Runtime 1.17: CUDA 12 support, Phi-2 optimizations, WebGPU, and more!',
+      date: 'February 28th, 2024',
+      blurb:
+        'From Phi-2 model optimizations to CUDA 12 support, read this post to learn more about some of the exciting new functionality introduced in the ONNX Runtime 1.17 release.',
+      link: 'blogs/ort-1-17-release',
+      image: ORT117Thumbnail,
+      imgalt: 'ONNX Runtime 1.17 release logo'
+    },
     {
       title: 'Accelerating Phi-2, CodeLlama, Gemma and other Gen AI models with ONNX Runtime',
       date: 'February 26th, 2024',
@@ -56,7 +66,9 @@
       link: 'https://cloudblogs.microsoft.com/opensource/2024/02/06/on-device-training-training-a-model-in-browser',
       image: WebTrainingImage,
       imgalt: 'Components of the onnxruntime-web JS package'
-    },
+    }
+  ];
+  let blogs = [
     {
       title: 'Accelerating SD Turbo and SDXL Turbo Inference with ONNX Runtime and Olive',
       date: 'January 15th, 2024',
@@ -66,8 +78,6 @@
       image: SDXLTurboImage,
       imgalt: 'SD Turbo and SDXL Turbo models with ONNX Runtime and Olive'
     },
-  ];
-  let blogs = [
     {
       title: 'Accelerating LLaMA-2 Inference with ONNX Runtime',
       date: 'November 14th, 2023',
diff --git a/src/routes/blogs/ort-1-17-release/+page.svx b/src/routes/blogs/ort-1-17-release/+page.svx
index f8fbaedfb6bd9..fd56326557bb4 100644
--- a/src/routes/blogs/ort-1-17-release/+page.svx
+++ b/src/routes/blogs/ort-1-17-release/+page.svx
@@ -1,6 +1,6 @@
 ---
-title: Unlock new functionality with ONNX Runtime 1.17
-date: '2024-02-26'
+title: 'ONNX Runtime 1.17: CUDA 12 support, Phi-2 optimizations, WebGPU, and more!'
+date: '28th February, 2024'
 description: 'From Phi-2 model optimizations to CUDA 12 support, read this post to learn more about some of the exciting new functionality introduced in the ONNX Runtime 1.17 release.'
 keywords: 'ORT, ONNX Runtime, ONNX, machine learning, deep learning, model optimization, Phi-2, Mistral, CodeLlama, SDXL-Turbo, on-device training, DirectML, NPU, WebGPU, Yolov8, pose detection, CUDA 12, GPU, Windows, browser'
 authors:
@@ -8,6 +8,8 @@
     'Sophie Schoenmeyer',
     'Parinita Rahi',
     'Kshama Pawar',
+    'Caroline Zhu',
+    'Chad Pralle',
     'Emma Ning',
     'Natalie Kershaw',
     'Jian Chen'
@@ -15,8 +17,10 @@
 authorsLink:
   [
     'https://www.linkedin.com/in/sophieschoenmeyer/',
-    https://www.linkedin.com/in/parinitaparinita/,
-    'https://www.linkedin.com/in/kshama-pawar',
+    'https://www.linkedin.com/in/parinitaparinita/',
+    'https://www.linkedin.com/in/kshama-pawar/',
+    'https://www.linkedin.com/in/carzh/',
+    'https://www.linkedin.com/in/chadpralle/',
     '',
     'https://www.linkedin.com/in/natkershaw/',
     ''
@@ -25,47 +29,57 @@
 image: ''
 url: 'https://onnxruntime.ai/blogs/ort-1_17-release-blog'
 ---

-# ONNX Runtime 1.17 Release Blog
-
 Recently, we released ONNX Runtime 1.17, which includes a host of new features to further streamline the process of inferencing and training machine learning models across various platforms faster than ever. The release includes improvements to some of our existing features, along with exciting new features like Phi-2 optimizations, training a model in-browser with on-device training, ONNX Runtime Web with WebGPU, and more.

-For a complete list of new features, along with various assets, check out the release on GitHub: [ONNX Runtime v1.17.0](https://github.com/microsoft/onnxruntime/releases/tag/v1.17.0).
+For a complete list of new features, along with various assets, check out the [1.17 release](https://github.com/microsoft/onnxruntime/releases/tag/v1.17.0) and our recent [1.17.1 patch release](https://github.com/microsoft/onnxruntime/releases/tag/v1.17.1) on GitHub.
+
+# Model Optimization
+
+The ONNX Runtime (ORT) 1.17 release provides improved inference performance for several models, such as [Phi-2](https://huggingface.co/microsoft/phi-2), [Mistral](https://huggingface.co/mistralai/Mistral-7B-v0.1), [CodeLlama](https://huggingface.co/codellama), [Google's Gemma](https://huggingface.co/google/gemma-7b), [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo), and more, by using state-of-the-art fusion and kernel optimizations and including support for float16 and int4 quantization. The specific ORT optimizations added in this release are Attention, Multi-Head Attention, Grouped-Query Attention, and Rotary Embedding ORT kernel changes. ORT outperforms other frameworks like PyTorch, DeepSpeed, and Llama.cpp in terms of prompt and token generation throughput, with speedups as high as **20x**. In particular, we observe performance improvements as high as **20.5x for Phi-2**, **16.0x for Orca-2**, and **19.8x for Gemma** (see the linked blog below for additional details for each model). ONNX Runtime with int4 quantization performs best with batch size 1 due to a special GemV kernel implementation. Overall, ONNX Runtime demonstrates significant performance gains across several batch sizes and prompt lengths.

-## Models Optimization
+ONNX Runtime also shows significant benefits for training LLMs, and these gains typically increase with batch size. For example, ORT is **1.2x faster** than PyTorch Eager mode and **1.5x faster** than torch.compile for Phi-2 with LoRA on 2 A100 GPUs. ORT also shows benefits for other LLMs, like Llama, Mistral, and Orca-2, with combinations of LoRA or QLoRA.

-The ONNX Runtime (ORT) 1.17 release provides improved inference performance for several models, such as [Phi-2](https://huggingface.co/microsoft/phi-2), [Mistral](https://huggingface.co/mistralai/Mistral-7B-v0.1), [CodeLlama](https://huggingface.co/codellama), and [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo), by using state-of-the-art fusion and kernel optimizations and including support for Float16 and Int4 quantization. The specific ORT optimizations added in this release are Attention, Multi-Head Attention, Grouped-Query Attention, and Rotary Embedding ORT kernel changes. ORT outperforms other frameworks like PyTorch, DeepSpeed, and Llama.cpp in terms of prompt and token generation throughput, with speedups as high as **18.55x** for **Phi-2** with Float16, **20.48x** for Phi-2 with Int4, and **4.1x** for Mistral with Float16 (see linked blogs below for additional details).
+To read more about improving generative AI model performance with ONNX Runtime 1.17, check out our recent post on the ONNX Runtime blog: [Accelerating Phi-2, CodeLlama, Gemma and other Gen AI models with ONNX Runtime](https://onnxruntime.ai/blogs/accelerating-phi-2).

-ONNX Runtime also shows significant benefits for training LLMs, and these gains typically increase with batch size. For example, ORT is 1.2x faster than PyTorch Eager mode and 1.5x faster than torch.compile for Phi-2 with LoRA on 2 A100 GPUs. ORT also shows benefits for other LLMs, like Llama, Mistral, and Orca-2, with combinations of LoRA or QLoRA.
+
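As a companion to the int4 inference claims in the added blog text above, here is a minimal, illustrative sketch of loading an ORT-optimized model with the CUDA execution provider. The path `phi2-int4/model.onnx` is a hypothetical placeholder for an already-exported, int4-quantized Phi-2 graph (export and quantization are covered in the linked accelerating-phi-2 post), and the real input signature depends on how that export was produced.

```python
# Illustrative sketch only; the model path is a hypothetical placeholder for an
# int4-quantized Phi-2 ONNX export, not a file shipped with ONNX Runtime.
import onnxruntime as ort

session = ort.InferenceSession(
    "phi2-int4/model.onnx",  # assumed local export location
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],  # prefer CUDA, fall back to CPU
)

# Confirm which execution provider was actually selected at session creation time.
print(session.get_providers())

# Inspect the exported graph's real input signature before building a generation loop;
# decoder exports typically also expect position ids and past key/value cache tensors.
for inp in session.get_inputs():
    print(inp.name, inp.shape, inp.type)
```

Token generation itself requires a decoding loop that feeds the past key/value outputs back in as inputs; batch size 1 is the case where the GemV int4 kernel mentioned above applies.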
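For the training numbers quoted above (Phi-2 with LoRA), the general recipe is to wrap the PyTorch model in ORTModule from the onnxruntime-training package so that forward and backward passes run through ONNX Runtime. The sketch below is hedged: the model name, LoRA hyperparameters, and target modules are illustrative assumptions rather than the exact benchmark configuration.

```python
# Hedged sketch: ORTModule as a drop-in wrapper around a PEFT/LoRA model.
# Requires the onnxruntime-training package; hyperparameters here are illustrative.
import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from onnxruntime.training.ortmodule import ORTModule

base = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", torch_dtype=torch.float16)

# Assumed LoRA settings; target_modules must match the model's actual projection names.
lora = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, target_modules=["q_proj", "v_proj"])
model = get_peft_model(base, lora)

# Forward/backward now execute through ONNX Runtime; the rest of the training loop
# (optimizer, loss computation, backward, step) stays standard PyTorch.
model = ORTModule(model)
```

Microsoft's onnxruntime-training-examples repository hosts the maintained recipes for these models, which may differ in detail from this sketch.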