diff --git a/docs/execution-providers/OpenVINO-ExecutionProvider.md b/docs/execution-providers/OpenVINO-ExecutionProvider.md index 35f64989e8851..f16d2d79f89fd 100644 --- a/docs/execution-providers/OpenVINO-ExecutionProvider.md +++ b/docs/execution-providers/OpenVINO-ExecutionProvider.md @@ -94,7 +94,7 @@ Enables [OpenCL queue throttling](https://docs.openvino.ai/latest/groupov_runtim ### Model caching -OpenVINO™ supports [model caching](https://docs.openvino.ai/latest/openvino_docs_OV_UG_Model_caching_overview.html). +OpenVINO™ supports [model caching](https://docs.openvino.ai/2023.3/openvino_docs_OV_UG_Model_caching_overview.html). From OpenVINO™ 2023.1 version, model caching feature is supported on CPU, GPU along with kernel caching on iGPU, dGPU. diff --git a/src/images/blogs/mistral_thumbnail.png b/src/images/blogs/mistral_thumbnail.png new file mode 100644 index 0000000000000..0486e556162c6 Binary files /dev/null and b/src/images/blogs/mistral_thumbnail.png differ diff --git a/src/routes/blogs/+page.svelte b/src/routes/blogs/+page.svelte index 2ccf0470ae1cb..5629f2ec96632 100644 --- a/src/routes/blogs/+page.svelte +++ b/src/routes/blogs/+page.svelte @@ -13,6 +13,7 @@ import { createEventDispatcher } from 'svelte'; import ORT117Thumbnail from '../../images/blogs/ort-1-17-thumbnail.png'; import WebGPUImage from '../../images/blogs/webgpu_blog_thumbnail.jpg'; + import MistralImage from '../../images/blogs/mistral_thumbnail.png'; import WebTrainingImage from '../../images/blogs/webtraining_blog_thumbnail.png'; onMount(() => { anime({ @@ -41,6 +42,15 @@ dispatch('switchTab', tab); } let featuredblog = [ + { + title: 'Accelerating Mistral inference with ONNX Runtime and Olive', + date: 'March 11th, 2024', + blurb: + 'Learn how to use ONNX Runtime and Olive to 9X your Mistral model inference!', + link: 'blogs/accelerating-mistral', + image: MistralImage, + imgalt: 'Mistral prompt throughput comparisons for ONNX Runtime FP16 vs. torch.compile and llama.cpp' + }, { title: 'ONNX Runtime Web unleashes generative AI in the browser using WebGPU', date: 'February 29th, 2024', @@ -58,7 +68,9 @@ link: 'blogs/ort-1-17-release', image: ORT117Thumbnail, imgalt: 'ONNX Runtime 1.17 release logo' - }, + } + ]; + let blogs = [ { title: 'Accelerating Phi-2, CodeLlama, Gemma and other Gen AI models with ONNX Runtime', date: 'February 26th, 2024', @@ -67,9 +79,7 @@ link: 'blogs/accelerating-phi-2', image: Phi2Image, imgalt: 'Phi2 float16 token generation throughput comparison' - } - ]; - let blogs = [ + }, { title: 'On-Device Training: Training a model in browser', date: 'February 6th, 2024', diff --git a/src/routes/blogs/accelerating-mistral/+page.svx b/src/routes/blogs/accelerating-mistral/+page.svx new file mode 100644 index 0000000000000..153c7cb042ba2 --- /dev/null +++ b/src/routes/blogs/accelerating-mistral/+page.svx @@ -0,0 +1,74 @@ +--- +title: 'Accelerating Mistral inference with ONNX Runtime and Olive' +date: '11th March, 2024' +description: 'Learn how to use ONNX Runtime and Olive to 9X your Mistral model inference!' 
+keywords: 'ORT, ONNX Runtime, ONNX, machine learning, deep learning, model optimization, Mistral, Mixtral, MistralAI, Mistral AI'
+authors:
+  [
+    'Sophie Schoenmeyer',
+    'Peter Mcaughan'
+  ]
+authorsLink:
+  [
+    'https://www.linkedin.com/in/sophieschoenmeyer/',
+    'https://www.linkedin.com/in/peter-mcaughan/'
+  ]
+image: ''
+url: 'https://onnxruntime.ai/blogs/accelerating-mistral'
+---
+
+# Introduction
+
+[Mistral](https://huggingface.co/mistralai/Mistral-7B-v0.1) is a decoder-only LLM with 7B parameters that is accessible to open-source developers and reported to outperform ([Llama2-13B](https://huggingface.co/meta-llama/Llama-2-7b), [Vicuna-13B](https://huggingface.co/lmsys/vicuna-13b-v1.5)) or closely match ([WizardLM-13B](https://huggingface.co/WizardLM/WizardLM-13B-V1.2)) models twice its size. With [ONNX Runtime](https://github.com/microsoft/onnxruntime), users can speed up Mistral inference significantly, and [Olive](https://github.com/microsoft/Olive) makes the model optimization process easier than ever.
+
+# Usage instructions
+
+You can try out these optimizations yourself in Olive for FP16 models running on GPU using only one line of code:
+
+```
+python mistral.py --optimize --config mistral_fp16_optimize.json
+```
+
+To test inference, run the script with `--inference` as follows:
+
+```
+CUDA_VISIBLE_DEVICES=6 python mistral.py --inference
+```
+
+For a complete list of instructions, check out the Mistral example in the Olive GitHub repository [here](https://github.com/microsoft/Olive/tree/main/examples/mistral).
+
+Our optimized version of Mistral is also directly available on Hugging Face under the Microsoft username [here](https://huggingface.co/microsoft/Mistral-7B-v0.1-onnx).
+
+# Benchmark results
+
+We benchmarked Mistral on a Standard ND96amsr A100 v4 VM with an NVIDIA A100 GPU and FP16 precision. The results were measured using the following software versions:
+
+- **torch:** 2.2.0
+- **triton:** 2.2.0
+- **onnxruntime-gpu:** 1.17.0
+- **llama.cpp:** commit 594fca3fefe27b8e95cfb1656eb0e160ad15a793
+
+To reproduce these results, we recommend using Olive, as outlined in the “Usage instructions” section above.
+
+The graphs below illustrate the speedup ratios of throughput for ONNX Runtime FP16 vs. torch.compile and llama.cpp for different (batch size, sequence length) combinations. These results were measured for both prompt and token generation.
+
+![Mistral prompt throughput comparisons for ONNX Runtime FP16 vs. torch.compile and llama.cpp](./mistral_prompt_throughput.png)
+
+With FP16, ONNX Runtime prompt throughput is up to **9.46x faster** than llama.cpp prompt throughput and up to **4.81x faster** than torch.compile prompt throughput.
+
+![Mistral token generation throughput comparisons for ONNX Runtime FP16 vs. torch.compile and llama.cpp](./mistral_token_throughput.png)
+
+With FP16, ONNX Runtime token generation throughput is up to **5.79x faster** than llama.cpp token generation throughput and up to **4.95x faster** than torch.compile token generation throughput.
+
+# Key features
+
+The following features were optimized in the ONNX Runtime library and applied with Olive to yield the Mistral results outlined in this post:
+
+- **Grouped Query Attention** – This feature was already supported in the Flash Attention implementation of ONNX Runtime from our previous Llama2 optimization.
+- **Sliding Window Attention** – The ONNX Runtime implementation of Flash Attention was modified to include the `window_size` parameter, which provided support for this feature (see the conceptual sketch below).
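+
+To make the sliding window mechanism easier to picture, here is a minimal, self-contained sketch of how a sliding-window causal mask can be built and applied in a reference attention computation. It is purely illustrative: the helper names and the 4-token window are assumptions made for this example, not the fused Flash Attention kernel code that ONNX Runtime actually runs (Mistral's real window is 4096 tokens).
+
+```python
+# Illustrative sketch: a naive sliding-window causal mask plus a reference
+# (unfused) attention computation. This is not ONNX Runtime kernel code.
+import numpy as np
+
+def sliding_window_causal_mask(seq_len: int, window_size: int) -> np.ndarray:
+    # Position i may attend to positions j with i - window_size < j <= i.
+    i = np.arange(seq_len)[:, None]
+    j = np.arange(seq_len)[None, :]
+    return (j <= i) & (j > i - window_size)
+
+def naive_sliding_window_attention(q, k, v, window_size):
+    # q, k, v: (seq_len, head_dim); single head, no batching, for clarity.
+    scores = q @ k.T / np.sqrt(q.shape[-1])
+    mask = sliding_window_causal_mask(q.shape[0], window_size)
+    scores = np.where(mask, scores, -np.inf)
+    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
+    weights /= weights.sum(axis=-1, keepdims=True)
+    return weights @ v
+
+# Tiny demo: 8 tokens, 64-dim head, 4-token window.
+rng = np.random.default_rng(0)
+q = k = v = rng.standard_normal((8, 64)).astype(np.float32)
+print(naive_sliding_window_attention(q, k, v, window_size=4).shape)  # (8, 64)
+```
+
+In the optimized model, this masking is handled inside the fused Flash Attention kernel via the `window_size` parameter, so no explicit seq_len × seq_len mask needs to be materialized.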
+
+Since the Mistral model architecture is very similar to that of Llama2, we recommend reading our recent [Accelerating LLaMA-2 Inference with ONNX Runtime](https://onnxruntime.ai/blogs/accelerating-llama-2) blog post to learn more about these optimizations.
+
+# Coming soon
+
+ONNX Runtime optimizations for Mistral AI’s recent [Mixtral](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) model are currently in progress. We plan to publish a follow-up blog post soon with more information about these optimizations and the corresponding performance improvements.
\ No newline at end of file
diff --git a/src/routes/blogs/accelerating-mistral/mistral_prompt_throughput.png b/src/routes/blogs/accelerating-mistral/mistral_prompt_throughput.png
new file mode 100644
index 0000000000000..4b278a68a17f3
Binary files /dev/null and b/src/routes/blogs/accelerating-mistral/mistral_prompt_throughput.png differ
diff --git a/src/routes/blogs/accelerating-mistral/mistral_token_throughput.png b/src/routes/blogs/accelerating-mistral/mistral_token_throughput.png
new file mode 100644
index 0000000000000..f5514a414ace7
Binary files /dev/null and b/src/routes/blogs/accelerating-mistral/mistral_token_throughput.png differ