diff --git a/src/images/blogs/phi-3-on-device_blog_thumbnail.png b/src/images/blogs/phi-3-on-device_blog_thumbnail.png new file mode 100644 index 0000000000000..65767c432f7e5 Binary files /dev/null and b/src/images/blogs/phi-3-on-device_blog_thumbnail.png differ diff --git a/src/routes/blogs/+page.svelte b/src/routes/blogs/+page.svelte index b128ed789e381..93f95d2116059 100644 --- a/src/routes/blogs/+page.svelte +++ b/src/routes/blogs/+page.svelte @@ -15,6 +15,7 @@ import ORT117Thumbnail from '../../images/blogs/ort-1-17-thumbnail.png'; import WebGPUImage from '../../images/blogs/webgpu_blog_thumbnail.jpg'; import WebTrainingImage from '../../images/blogs/webtraining_blog_thumbnail.png'; + import Phi3OnDeviceImage from '../../images/blogs/phi-3-on-device_blog_thumbnail.png'; onMount(() => { anime({ targets: '.border-primary', @@ -42,6 +43,16 @@ dispatch('switchTab', tab); } let featuredblog = [ + { + title: 'Enjoy the Power of Phi-3 with ONNX Runtime on your device', + date: 'May 20th, 2024', + blurb: + "Harness ONNX Runtime to run Phi-3-mini on mobile phones and in the browser.", + link: 'blogs/phi-3-on-device', + image: Phi3OnDeviceImage, + imgalt: + 'Chart comparing model size (in GB) of ONNX Phi-3-mini for web and mobile with original Phi-3-mini' + }, { title: 'ONNX Runtime supports Phi-3 mini models across platforms and devices', date: 'April 22nd, 2024', @@ -61,7 +72,9 @@ image: WebGPUImage, imgalt: 'Comparison of ONNX Runtime Web with WebGPU EP on GPU vs. WASM EP on CPU for segment anything example' - }, + } + ]; + let blogs = [ { title: 'ONNX Runtime 1.17: CUDA 12 support, Phi-2 optimizations, WebGPU, and more!', date: 'February 28th, 2024', @@ -71,9 +84,6 @@ image: ORT117Thumbnail, imgalt: 'ONNX Runtime 1.17 release logo' }, - - ]; - let blogs = [ { title: 'Accelerating Phi-2, CodeLlama, Gemma and other Gen AI models with ONNX Runtime', date: 'February 26th, 2024', diff --git a/src/routes/blogs/phi-3-on-device/+page.svx b/src/routes/blogs/phi-3-on-device/+page.svx new file mode 100644 index 0000000000000..02c4e8b4b301c --- /dev/null +++ b/src/routes/blogs/phi-3-on-device/+page.svx @@ -0,0 +1,76 @@ +--- +title: 'Enjoy the Power of Phi-3 with ONNX Runtime on your device' +date: '20th May, 2024' +description: 'Harness ONNX Runtime to run Phi-3-mini on mobile phones and in the browser.' +keywords: 'ORT, ONNX Runtime, ONNX, machine learning, deep learning, phi 3, phi-3, webgpu, webnn, webassembly, hugging face, huggingface, transformers.js' +authors: + [ + 'Emma Ning (Microsoft)', + 'Scott McKay (Microsoft)', + 'Guenther Schmuelling (Microsoft)', + 'Joshua Lochner (Hugging Face)' + ] +authorsLink: + [ + 'https://www.linkedin.com/in/qiong-ning-21b554236/', + 'https://www.linkedin.com/in/scott-mckay-68b5ab/', + 'https://www.linkedin.com/in/guentherschmuelling/', + 'https://www.linkedin.com/in/xenova/?originalSubdomain=za' + ] +image: '' +url: 'https://onnxruntime.ai/blogs/phi-3-on-device' +--- + +Phi-3 models, a family of open AI models developed by Microsoft, are the most capable and cost-effective small language models (SLMs) available, outperforming models of the same size and next size up across a variety of language, reasoning, coding, and math benchmarks. + +The Phi-3 family consists of different variants by size. [Phi-3-mini](https://arxiv.org/abs/2404.14219), a 3.8 billion parameter language model trained on 3.3 trillion tokens, demonstrates performance comparable to models like Mixtral 8x7B and GPT-3.5. Remarkably, it's compact enough for deployment to client devices. 
Operating Phi-3-mini offline on client devices opens up numerous opportunities for information retrieval in scenarios with limited connectivity or sensitive privacy concerns. + +Client devices are highly constrained for compute, memory, and power. To bring LLMs to these devices, compressed and optimized models with a lightweight runtime are essential. In this blog, we will show how to harness ONNX Runtime to run Phi-3-mini on mobile phones and in the browser. + +# Get compressed and optimized Phi-3-mini models + +Quantization reduces the precision of a neural network's weights and activations. This technique can notably decrease model size and inference time, as well as memory and power consumption, albeit with some loss of accuracy. Graph fusion is another effective method for achieving high performance by minimizing elementary computations and fully leveraging kernel optimization and hardware acceleration within the runtime. + +We applied 4-bit block quantization to both the web and mobile models. Due to platform and implementation characteristics, there are slight differences in graph fusion and data types between the two versions. For instance, the web model uses the Multi-Head Attention fusion, whereas the mobile model uses the [Group Query Attention](https://arxiv.org/pdf/2305.13245) fusion. Additionally, the output logits of the web model remain in float32 due to JavaScript's lack of support for float16. The optimized models for mobile and web are notably smaller than the original PyTorch version, with a reduction in size of more than 2.5 times. + + + +We've integrated all of this ahead-of-time quantization and optimization into Olive, a hardware-aware model optimization tool, allowing users to easily generate compressed and optimized models tailored for various hardware targets. Here's an [Olive example of Phi-3 optimization](https://github.com/microsoft/Olive/tree/main/examples/phi3). Meanwhile, we've uploaded the pre-optimized [mobile model](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx/tree/main/cpu_and_mobile) and [web model](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx-web) to Hugging Face. Feel free to use them directly. + +# Run Phi-3 with ONNX Runtime on mobile and web + +ONNX Runtime is a cross-platform inference engine for machine learning models in the open standard ONNX format. It supports a wide range of hardware platforms and offers various optimizations and acceleration techniques to improve performance and efficiency. Here we focus on its two solutions for on-device inference: ONNX Runtime Mobile and ONNX Runtime Web. Both solutions also allow customers to customize the build and tailor the model to reduce the binary size and the runtime memory footprint. + +## Phi-3 with ONNX Runtime Mobile + +ONNX Runtime Mobile provides a lightweight inference engine that enables easy integration of ONNX models into mobile applications on Android, iOS, React Native, and MAUI/Xamarin. It supports various hardware accelerators tailored for mobile devices, such as the default CPU accelerator and the XNNPACK accelerator available across all mobile platforms, as well as platform-specific accelerators like CoreML for iOS and NNAPI and QNN for Android.
For easy integration into different mobile development environments, it also provides a rich set of language bindings, including C, C++, Java, Objective-C, and C#. + +We leverage the ONNX Runtime Mobile default CPU accelerator to run inference with the 4-bit quantized Phi-3 model on mobile phones. Within ONNX Runtime, optimized ARM64 kernels have been developed to accelerate INT4 quantized matrix multiplication. Additionally, it can accommodate 8-bit integer quantization of the inputs, further enhancing the performance of INT4 quantized matrix multiplication with acceptable accuracy loss. Moreover, ONNX Runtime includes support for Group Query Attention, an operator specifically crafted to execute attention efficiently while enabling past and present KV caches to share the same buffer. These optimizations collectively enable high-performance CPU execution with ONNX Runtime on mobile devices. Here, we showcase the performance of Phi-3-mini token generation on a Samsung Galaxy S21: ONNX Runtime Mobile surpasses llama.cpp in scenarios with small prompt lengths and numbers of tokens to generate, while remaining comparable in other cases. + + + +*Note: These performance benchmarks were run using four threads on a Samsung Galaxy S21.* + +
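+
+As a rough illustration of the Java bindings mentioned above, the sketch below creates an ONNX Runtime session for the quantized model using the same four-thread CPU setup as the benchmark note. This is a minimal, hypothetical snippet (the class name and model path are placeholders), and it covers session creation only; the full token-generation loop is better handled by the ONNX Runtime Generate() API discussed next.
+
+```java
+import ai.onnxruntime.OrtEnvironment;
+import ai.onnxruntime.OrtException;
+import ai.onnxruntime.OrtSession;
+
+// Minimal sketch: create an ONNX Runtime session on Android via the Java API.
+// Assumes the ONNX Runtime Android package is on the classpath and that
+// modelPath points to the 4-bit quantized Phi-3-mini ONNX model on device.
+public final class Phi3SessionFactory {
+  public static OrtSession create(String modelPath) throws OrtException {
+    OrtEnvironment env = OrtEnvironment.getEnvironment();
+    OrtSession.SessionOptions options = new OrtSession.SessionOptions();
+    options.setIntraOpNumThreads(4); // matches the four-thread benchmark setup above
+    return env.createSession(modelPath, options); // default CPU execution provider
+  }
+}
+```
+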
Here is an end-to-end (E2E) sample for running Phi-3-mini with ONNX Runtime Mobile on Android. It utilizes the ONNX Runtime Generate() API, which speeds up the inference loop for generative AI models in ONNX format: it handles inference with ONNX Runtime, logits processing, search and sampling, and KV cache management. Since the official mobile bindings of the ONNX Runtime Generate() API are still in progress, the sample uses the C API, creating a JNI wrapper to bridge the Generate() C API and Java-level calls. Stay tuned for updates on the official mobile bindings of the ONNX Runtime Generate() API. For more details on the ONNX Runtime Generate() API, refer to the documentation.
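+
+To make that bridging concrete, here is a minimal, hypothetical sketch of what the Java side of such a JNI wrapper can look like. The class and method names are illustrative only (they are not the E2E sample's actual API); the native methods would be implemented in C on top of the ONNX Runtime Generate() C API, which takes care of tokenization, model execution, sampling, and KV cache management.
+
+```java
+// Hypothetical JNI wrapper: Java declares native methods, and a companion C
+// library implements them by calling the ONNX Runtime Generate() C API.
+public final class GenAiWrapper implements AutoCloseable {
+  static {
+    // Native library containing the C implementations of the methods below.
+    System.loadLibrary("genai_wrapper");
+  }
+
+  private long nativeHandle; // opaque pointer to native model/generator state
+
+  public GenAiWrapper(String modelDir) {
+    nativeHandle = createModel(modelDir);
+  }
+
+  public String generate(String prompt, int maxLength) {
+    return generate(nativeHandle, prompt, maxLength);
+  }
+
+  @Override
+  public void close() {
+    releaseModel(nativeHandle);
+    nativeHandle = 0;
+  }
+
+  private native long createModel(String modelDir);
+  private native String generate(long handle, String prompt, int maxLength);
+  private native void releaseModel(long handle);
+}
+```
+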
+ +## Phi-3 with ONNX Runtime Web + +ONNX Runtime Web also powers Transformers.js, a library for running Hugging Face transformers directly in the browser. The library is a popular choice for developers to run state-of-the-art pretrained models in the browser with as few as 3 lines of code. Utilizing ONNX Runtime Web with the WebAssembly backend, Transformers.js supports numerous models across the Natural Language Processing, Vision, Audio, Tabular, and Multimodal domains. Now, Hugging Face and Microsoft are collaborating to enable WebGPU support through ONNX Runtime Web in Transformers.js to further accelerate performance, with new demos being released as the team makes progress. The latest demo with Phi-3-mini provides the user with a private (and powerful) chatbot experience. You can try it out on Hugging Face Spaces.
+ + + +