From ecb3791b354a1bf4b0479fe642e75d4cf09b79ae Mon Sep 17 00:00:00 2001 From: Jing Xu Date: Tue, 5 Nov 2024 12:48:23 +0900 Subject: [PATCH] add release notes to 2.5.0 (#3360) --- .../_sources/tutorials/examples.md.txt | 2 +- .../tutorials/features/fast_bert.md.txt | 4 +- .../_sources/tutorials/getting_started.md.txt | 5 +- .../_sources/tutorials/installation.md.txt | 4 +- .../_sources/tutorials/introduction.rst.txt | 2 +- .../_sources/tutorials/releases.md.txt | 27 ++ .../_static/htmls/tbl_deepspeed.html | 28 +- cpu/2.5.0+cpu/_static/htmls/tbl_single.html | 120 +++---- cpu/2.5.0+cpu/design_doc/cpu/isa_dyndisp.html | 2 +- cpu/2.5.0+cpu/genindex.html | 2 +- cpu/2.5.0+cpu/index.html | 2 +- cpu/2.5.0+cpu/py-modindex.html | 2 +- cpu/2.5.0+cpu/search.html | 2 +- cpu/2.5.0+cpu/searchindex.js | 2 +- cpu/2.5.0+cpu/tutorials/api_doc.html | 2 +- .../tutorials/blogs_publications.html | 2 +- cpu/2.5.0+cpu/tutorials/cheat_sheet.html | 2 +- cpu/2.5.0+cpu/tutorials/contribution.html | 2 +- cpu/2.5.0+cpu/tutorials/examples.html | 4 +- cpu/2.5.0+cpu/tutorials/features.html | 2 +- cpu/2.5.0+cpu/tutorials/features/amp.html | 2 +- .../features/auto_channels_last.html | 2 +- .../features/codeless_optimization.html | 2 +- .../tutorials/features/fast_bert.html | 6 +- .../tutorials/features/graph_capture.html | 2 +- .../features/graph_optimization.html | 2 +- .../tutorials/features/hypertune.html | 2 +- .../tutorials/features/int8_overview.html | 2 +- .../features/int8_recipe_tuning_api.html | 2 +- .../features/isa_dynamic_dispatch.html | 2 +- cpu/2.5.0+cpu/tutorials/features/nhwc.html | 2 +- .../tutorials/features/optimizer_fusion.html | 2 +- .../tutorials/features/runtime_extension.html | 2 +- .../tutorials/features/split_sgd.html | 2 +- .../features/sq_recipe_tuning_api.html | 2 +- cpu/2.5.0+cpu/tutorials/getting_started.html | 6 +- cpu/2.5.0+cpu/tutorials/installation.html | 6 +- cpu/2.5.0+cpu/tutorials/introduction.html | 4 +- cpu/2.5.0+cpu/tutorials/known_issues.html | 2 +- 
cpu/2.5.0+cpu/tutorials/license.html | 2 +- cpu/2.5.0+cpu/tutorials/llm.html | 150 +++++---- cpu/2.5.0+cpu/tutorials/llm/llm_optimize.html | 2 +- cpu/2.5.0+cpu/tutorials/performance.html | 2 +- .../performance_tuning/launch_script.html | 2 +- .../performance_tuning/torchserve.html | 2 +- .../performance_tuning/tuning_guide.html | 2 +- cpu/2.5.0+cpu/tutorials/releases.html | 299 ++++++++++-------- 47 files changed, 417 insertions(+), 314 deletions(-) diff --git a/cpu/2.5.0+cpu/_sources/tutorials/examples.md.txt b/cpu/2.5.0+cpu/_sources/tutorials/examples.md.txt index 27a737e86..809d9046d 100644 --- a/cpu/2.5.0+cpu/_sources/tutorials/examples.md.txt +++ b/cpu/2.5.0+cpu/_sources/tutorials/examples.md.txt @@ -502,7 +502,7 @@ print("Execution finished") import torch from transformers import BertModel -model = BertModel.from_pretrained("bert-base-uncased") +model = BertModel.from_pretrained("bert-base-uncased", attn_implementation="eager") model.eval() vocab_size = model.config.vocab_size diff --git a/cpu/2.5.0+cpu/_sources/tutorials/features/fast_bert.md.txt b/cpu/2.5.0+cpu/_sources/tutorials/features/fast_bert.md.txt index d0621131f..848fd307d 100644 --- a/cpu/2.5.0+cpu/_sources/tutorials/features/fast_bert.md.txt +++ b/cpu/2.5.0+cpu/_sources/tutorials/features/fast_bert.md.txt @@ -9,7 +9,7 @@ Currently `ipex.fast_bert` API is only well optimized for training. For inferenc ### Prerequisite -- Transformers 4.6.0 ~ 4.43.2 +- Transformers 4.6.0 ~ 4.45.0 ### Usage Example @@ -20,7 +20,7 @@ An API `ipex.fast_bert` is provided for a simple usage. 
Usage of this API follow import torch from transformers import BertModel -model = BertModel.from_pretrained("bert-base-uncased") +model = BertModel.from_pretrained("bert-base-uncased", attn_implementation="eager") model.eval() vocab_size = model.config.vocab_size diff --git a/cpu/2.5.0+cpu/_sources/tutorials/getting_started.md.txt b/cpu/2.5.0+cpu/_sources/tutorials/getting_started.md.txt index 67874f6d4..ee11aabc7 100644 --- a/cpu/2.5.0+cpu/_sources/tutorials/getting_started.md.txt +++ b/cpu/2.5.0+cpu/_sources/tutorials/getting_started.md.txt @@ -1,6 +1,6 @@ # Quick Start -The following instructions assume you have installed the Intel® Extension for PyTorch\*. For installation instructions, refer to [Installation](../../../index.html#installation?platform=cpu&version=main). +The following instructions assume you have installed the Intel® Extension for PyTorch\*. For installation instructions, refer to [Installation](../../../index.html#installation?platform=cpu&version=v2.5.0%2Bcpu). To start using the Intel® Extension for PyTorch\* in your code, you need to make the following changes: @@ -64,7 +64,6 @@ In [Cheat Sheet](./cheat_sheet.md), you can find more commands that can help you `ipex.llm.optimize` is used for Large Language Models (LLM). - ```python import torch #################### code changes #################### @@ -157,4 +156,4 @@ with torch.inference_mode(), torch.cpu.amp.autocast(enabled=amp_enabled): print(gen_text, total_new_tokens, flush=True) ``` -More LLM examples, including usage of low precision data types are available in the [LLM Examples](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/llm) section. +More LLM examples, including usage of low precision data types are available in the [LLM Examples](https://github.com/intel/intel-extension-for-pytorch/tree/release/2.5/examples/cpu/llm) section. 
diff --git a/cpu/2.5.0+cpu/_sources/tutorials/installation.md.txt b/cpu/2.5.0+cpu/_sources/tutorials/installation.md.txt index 707a091db..567dd2d38 100644 --- a/cpu/2.5.0+cpu/_sources/tutorials/installation.md.txt +++ b/cpu/2.5.0+cpu/_sources/tutorials/installation.md.txt @@ -1,8 +1,8 @@ Installation ============ -Select your preferences and follow the installation instructions provided on the [Installation page](../../../index.html#installation?platform=cpu&version=v2.4.0%2Bcpu). +Select your preferences and follow the installation instructions provided on the [Installation page](../../../index.html#installation?platform=cpu&version=v2.5.0%2Bcpu). After successful installation, refer to the [Quick Start](getting_started.md) and [Examples](examples.md) sections to start using the extension in your code. -**NOTE:** For detailed instructions on installing and setting up the environment for Large Language Models (LLM), as well as example scripts, refer to the [LLM best practices](https://github.com/intel/intel-extension-for-pytorch/tree/v2.4.0%2Bcpu/examples/cpu/llm). +**NOTE:** For detailed instructions on installing and setting up the environment for Large Language Models (LLM), as well as example scripts, refer to the [LLM best practices](https://github.com/intel/intel-extension-for-pytorch/tree/v2.5.0%2Bcpu/examples/cpu/llm). diff --git a/cpu/2.5.0+cpu/_sources/tutorials/introduction.rst.txt b/cpu/2.5.0+cpu/_sources/tutorials/introduction.rst.txt index 8037db666..1c4309dfb 100644 --- a/cpu/2.5.0+cpu/_sources/tutorials/introduction.rst.txt +++ b/cpu/2.5.0+cpu/_sources/tutorials/introduction.rst.txt @@ -16,7 +16,7 @@ the `Large Language Models (LLM) `_ section. 
Get Started ----------- -- `Installation <../../../index.html#installation?platform=cpu&version=v2.4.0%2Bcpu>`_ +- `Installation <../../../index.html#installation?platform=cpu&version=v2.5.0%2Bcpu>`_ - `Quick Start `_ - `Examples `_ diff --git a/cpu/2.5.0+cpu/_sources/tutorials/releases.md.txt b/cpu/2.5.0+cpu/_sources/tutorials/releases.md.txt index 3ee67a92e..70c8fbed0 100644 --- a/cpu/2.5.0+cpu/_sources/tutorials/releases.md.txt +++ b/cpu/2.5.0+cpu/_sources/tutorials/releases.md.txt @@ -1,6 +1,33 @@ Releases ======== +## 2.5.0 + +We are excited to announce the release of Intel® Extension for PyTorch* 2.5.0+cpu which accompanies PyTorch 2.5. This release mainly brings you support for Llama 3.2, optimization for the newly launched Intel® Xeon® 6 P-core platform, GPTQ/AWQ format support, and the latest optimizations to push better performance for LLM models. This release also includes a set of bug fixes and small optimizations. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try this release and provide feedback to help us further improve this product. + +### Highlights + +* Llama 3.2 support + +Meta has newly released [Llama 3.2](https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/), which includes small and medium-sized vision LLMs (11B and 90B), and lightweight, text-only models (1B and 3B). Intel® Extension for PyTorch* has provided [support for Llama 3.2](https://www.intel.com/content/www/us/en/developer/articles/technical/intel-ai-solutions-support-the-new-llama-3-2-model.html) since its launch date with an early release version, and now supports it with this official release. + +* Optimization for Intel® Xeon® 6 +Intel® Xeon® 6 delivers new degrees of performance with more cores, a choice of microarchitecture, additional memory bandwidth, and exceptional input/output (I/O) across a range of workloads. 
Intel® Extension for PyTorch* provides dedicated optimization on this new processor family for features like Multiplexed Rank DIMM (MRDIMM), SNC=3 scenario, etc. + +* Large Language Model (LLM) optimization: +Intel® Extension for PyTorch* provides more feature support for weight-only quantization, including GPTQ/AWQ format support, symmetric quantization of activation and weight, and chunked prefill/prefix prefill support in the LLM module API, etc. These features enable better adoption of community model weights and provide better performance for low-precision scenarios. This release also extends the optimized models to include the newly published Llama 3.2 vision models. A full list of optimized models can be found at [LLM optimization](https://github.com/intel/intel-extension-for-pytorch/tree/v2.5.0+cpu/examples/cpu/llm/inference). + +* Bug fixes and other optimizations + - Optimized the performance of the IndirectAccessKVCacheAttention kernel +[#3185](https://github.com/intel/intel-extension-for-pytorch/commit/8572e1faf97998783ea2a7fc6ee3094090feebc4) [#3209](https://github.com/intel/intel-extension-for-pytorch/commit/65e96630a2e17f7b762c5c765f10264ad08db098) [#3214](https://github.com/intel/intel-extension-for-pytorch/commit/a04214f7ab4e43648d75abdcf0fae53e5076be2b) [#3218](https://github.com/intel/intel-extension-for-pytorch/commit/f219012ab1babbc67c9b545fa7251cd981a2a3a2) [#3248](https://github.com/intel/intel-extension-for-pytorch/commit/9f6178eb028d36b3ed1f5985e57b7cf160acf38a) + - Fixed the segmentation fault in the IndirectAccessKVCacheAttention kernel [#3246](https://github.com/intel/intel-extension-for-pytorch/commit/bee5ab644086c9b25eb61916c6773932c74667d3) + - Fixed the correctness issue in the PagedAttention kernel for Llama-68M-Chat-v1 [#3307](https://github.com/intel/intel-extension-for-pytorch/commit/638a7d26acb33af450ea9869b5b43ccdbe0e962b) + - Fixed the support in `ipex.llm.optimize` to ensure `model.generate` returns the correct output type 
when `return_dict_in_generate` is set to `True`. [#3333](https://github.com/intel/intel-extension-for-pytorch/commit/584a4e2e2c6193b926554f951d2608489cac5d7a) + - Optimized the performance of the Flash Attention kernel [#3291](https://github.com/intel/intel-extension-for-pytorch/commit/8fb43ec45ed93b62efef07f4b2e8dcd7dd502b8b) + - Upgraded oneDNN to v3.6 [#3305](https://github.com/intel/intel-extension-for-pytorch/commit/91639fa0812ee3c12c672002c2bf5cf1cac4bc0a) + +**Full Changelog**: https://github.com/intel/intel-extension-for-pytorch/compare/v2.4.0+cpu...v2.5.0+cpu + ## 2.4.0 We are excited to announce the release of Intel® Extension for PyTorch\* 2.4.0+cpu which accompanies PyTorch 2.4. This release mainly brings you the support for Llama3.1, basic support for LLM serving frameworks like vLLM/TGI, and a set of optimization to push better performance for LLM models. This release also extends the list of optimized LLM models to a broader level and includes a set of bug fixing and small optimizations. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try this release and feedback as to improve further on this product. diff --git a/cpu/2.5.0+cpu/_static/htmls/tbl_deepspeed.html b/cpu/2.5.0+cpu/_static/htmls/tbl_deepspeed.html index 2751ef96a..15fc0edf3 100644 --- a/cpu/2.5.0+cpu/_static/htmls/tbl_deepspeed.html +++ b/cpu/2.5.0+cpu/_static/htmls/tbl_deepspeed.html @@ -44,6 +44,18 @@

🟩

🟩

+ +

LLAMA

+

meta-llama/Llama-3.2-3B-Instruct

+

🟩

+

🟩

+ + +

LLAMA

+

meta-llama/Llama-3.2-11B-Vision-Instruct

+

🟩

+

🟩

+

GPT-J

EleutherAI/gpt-j-6b

@@ -53,13 +65,13 @@

GPT-NEOX

EleutherAI/gpt-neox-20b

-

🟨

+

🟩

🟩

DOLLY

databricks/dolly-v2-12b

-

🟨

+

🟩

🟩

@@ -77,7 +89,7 @@

OPT

facebook/opt-30b

-

🟨

+

🟩

🟩

@@ -89,7 +101,7 @@

Bloom

bigscience/bloom-1b7

-

🟨

+

🟩

🟩

@@ -113,7 +125,7 @@

Baichuan

baichuan-inc/Baichuan-13B-Chat

-

🟨

+

🟩

🟩

@@ -207,8 +219,4 @@

🟩

- -
    -
  • 🟩 signifies that the model can perform well and with good accuracy (<1% difference as compared with FP32).

  • -
  • 🟨 signifies that the model can perform well while accuracy may not been in a perfect state (>1% difference as compared with FP32).

  • -
\ No newline at end of file + \ No newline at end of file diff --git a/cpu/2.5.0+cpu/_static/htmls/tbl_single.html b/cpu/2.5.0+cpu/_static/htmls/tbl_single.html index 4ad7a2284..ef5f9c86a 100644 --- a/cpu/2.5.0+cpu/_static/htmls/tbl_single.html +++ b/cpu/2.5.0+cpu/_static/htmls/tbl_single.html @@ -16,9 +16,9 @@

meta-llama/Llama-2-7b-hf

🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

LLAMA

@@ -43,16 +43,16 @@

meta-llama/Meta-Llama-3-8B

🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

LLAMA

meta-llama/Meta-Llama-3-70B

🟩

🟩

-

🟨

+

🟩

🟩

🟩

@@ -61,10 +61,28 @@

meta-llama/Meta-Llama-3.1-8B-Instruct

🟩

🟩

-

🟨

+

🟩

🟩

🟩

+ +

LLAMA

+

meta-llama/Llama-3.2-3B-Instruct

+

🟩

+

🟩

+

🟩

+

🟩

+

🟩

+ + +

LLAMA

+

meta-llama/Llama-3.2-11B-Vision-Instruct

+

🟩

+

🟩

+

+

🟩

+

+

GPT-J

EleutherAI/gpt-j-6b

@@ -78,19 +96,19 @@

GPT-NEOX

EleutherAI/gpt-neox-20b

🟩

-

🟨

-

🟨

🟩

-

🟨

+

🟩

+

🟩

+

🟩

DOLLY

databricks/dolly-v2-12b

🟩

-

🟨

-

🟨

🟩

-

🟨

+

🟩

+

🟩

+

🟩

FALCON

@@ -99,7 +117,7 @@

🟩

🟩

🟩

-

+

🟩

FALCON

@@ -108,7 +126,7 @@

🟩

🟩

🟩

-

🟨

+

🟩

FALCON

@@ -126,7 +144,7 @@

🟩

🟩

🟩

-

🟨

+

🟩

OPT

@@ -135,16 +153,16 @@

🟩

🟩

🟩

-

🟨

+

🟩

Bloom

bigscience/bloom-1b7

🟩

-

🟨

🟩

🟩

-

🟨

+

🟩

+

🟩

CodeGen

@@ -162,34 +180,34 @@

🟩

🟩

🟩

-

🟨

+

🟩

Baichuan

baichuan-inc/Baichuan2-13B-Chat

🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

Baichuan

baichuan-inc/Baichuan-13B-Chat

🟩

-

🟨

🟩

🟩

-

🟨

+

🟩

+

🟩

ChatGLM

THUDM/chatglm3-6b

🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

ChatGLM

@@ -198,23 +216,23 @@

🟩

🟩

🟩

-

🟨

+

🟩

GPTBigCode

bigcode/starcoder

🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

T5

google/flan-t5-xl

🟩

🟩

-

🟨

+

🟩

🟩

@@ -232,9 +250,9 @@

mistralai/Mistral-7B-v0.1

🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

Mixtral

@@ -243,34 +261,34 @@

🟩

🟩

-

🟨

+

🟩

Stablelm

stabilityai/stablelm-2-1_6b

🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

Qwen

Qwen/Qwen-7B-Chat

🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

Qwen

Qwen/Qwen2-7B

🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

LLaVA

@@ -296,7 +314,7 @@

🟩

🟩

-

🟨

+

🟩

@@ -306,43 +324,43 @@

🟩

🟩

🟩

-

🟨

+

🟩

Phi

microsoft/Phi-3-mini-4k-instruct

🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

Phi

microsoft/Phi-3-mini-128k-instruct

🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

Phi

microsoft/Phi-3-medium-4k-instruct

🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

Phi

microsoft/Phi-3-medium-128k-instruct

🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

Whisper

@@ -354,8 +372,4 @@

- -
    -
  • 🟩 signifies that the model can perform well and with good accuracy (<1% difference as compared with FP32).

  • -
  • 🟨 signifies that the model can perform well while accuracy may not been in a perfect state (>1% difference as compared with FP32).

  • -
\ No newline at end of file + \ No newline at end of file diff --git a/cpu/2.5.0+cpu/design_doc/cpu/isa_dyndisp.html b/cpu/2.5.0+cpu/design_doc/cpu/isa_dyndisp.html index 7b6547cb6..69d9f100f 100644 --- a/cpu/2.5.0+cpu/design_doc/cpu/isa_dyndisp.html +++ b/cpu/2.5.0+cpu/design_doc/cpu/isa_dyndisp.html @@ -125,7 +125,7 @@

Intel® Extension for PyTorch* CPU ISA Dynamic Dispatch Design DocSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/genindex.html b/cpu/2.5.0+cpu/genindex.html index 2277ac96e..7d5f93d36 100644 --- a/cpu/2.5.0+cpu/genindex.html +++ b/cpu/2.5.0+cpu/genindex.html @@ -377,7 +377,7 @@

V

Built with Sphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/index.html b/cpu/2.5.0+cpu/index.html index 5c8318428..4eb3badab 100644 --- a/cpu/2.5.0+cpu/index.html +++ b/cpu/2.5.0+cpu/index.html @@ -183,7 +183,7 @@

Support Built with Sphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/py-modindex.html b/cpu/2.5.0+cpu/py-modindex.html index 8c9e02bb8..fff29e5f4 100644 --- a/cpu/2.5.0+cpu/py-modindex.html +++ b/cpu/2.5.0+cpu/py-modindex.html @@ -165,7 +165,7 @@

Python Module Index

Built with Sphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/search.html b/cpu/2.5.0+cpu/search.html index abb60a5a0..c53598003 100644 --- a/cpu/2.5.0+cpu/search.html +++ b/cpu/2.5.0+cpu/search.html @@ -140,7 +140,7 @@ Built with Sphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/searchindex.js b/cpu/2.5.0+cpu/searchindex.js index 082550705..5a4f1c84c 100644 --- a/cpu/2.5.0+cpu/searchindex.js +++ b/cpu/2.5.0+cpu/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"$\\alpha$ Usage": [[16, "alpha-usage"]], "1. Creating a serialized file": [[32, "creating-a-serialized-file"]], "1. Defining hyperparameters to tune:": [[14, "defining-hyperparameters-to-tune"]], "1.0.0-Alpha": [[34, "id45"]], "1.0.1-Alpha": [[34, "alpha"]], "1.0.2": [[34, "id44"]], "1.1.0": [[34, "id42"]], "1.10.0": [[34, "id32"]], "1.10.100": [[34, "id31"]], "1.11.0": [[34, "id29"]], "1.11.200": [[34, "id27"]], "1.12.0": [[34, "id24"]], "1.12.100": [[34, "id23"]], "1.12.300": [[34, "id21"]], "1.13.0": [[34, "id18"]], "1.13.100": [[34, "id16"]], "1.2.0": [[34, "id39"]], "1.8.0": [[34, "id37"]], "1.9.0": [[34, "id36"]], "2. Creating a Model Archive": [[32, "creating-a-model-archive"]], "2. Defining the search spaces of the hyperparameters:": [[14, "defining-the-search-spaces-of-the-hyperparameters"]], "2.0.0": [[34, "id14"]], "2.0.100": [[34, "id12"]], "2.1.0": [[34, "id10"]], "2.1.100": [[34, "id8"]], "2.2.0": [[34, "id6"]], "2.3.0": [[34, "id4"]], "2.3.100": [[34, "id2"]], "2.4.0": [[34, "id1"]], "3. Start TorchServe to serve the model": [[32, "start-torchserve-to-serve-the-model"]], "4. 
Registering and Deploying model": [[32, "registering-and-deploying-model"]], "": [[14, "your-python-script"]], "API Documentation": [[2, null], [25, "api-documentation"]], "Accuracy": [[30, "accuracy"]], "Add Custom Kernel": [[17, "add-custom-kernel"]], "Algorithm: Auto-tuning of $\\alpha$.": [[16, "algorithm-auto-tuning-of-alpha"]], "Already using Jit Trace": [[10, "already-using-jit-trace"]], "Already using ipex.optimize": [[10, "already-using-ipex-optimize"]], "Architecture": [[1, "architecture"]], "Auto Channels Last": [[7, "auto-channels-last"], [9, null]], "Auto Mixed Precision (AMP)": [[7, "auto-mixed-precision-amp"], [8, null]], "Autocast Op Reference": [[8, "autocast-op-reference"]], "BERT": [[6, "bert"], [6, "id2"], [6, "id4"], [6, "id7"], [6, "id10"], [6, "id13"], [32, "bert"]], "BFloat16": [[6, "bfloat16"], [21, "bfloat16"], [26, "bfloat16"]], "Benchmarking with Launcher": [[32, "benchmarking-with-launcher"]], "Benchmarking with Launcher Core Pinning": [[32, "benchmarking-with-launcher-core-pinning"]], "Better local unit tests with pytest": [[5, "better-local-unit-tests-with-pytest"]], "Blogs & Publications": [[3, null]], "Building documentation": [[5, "building-documentation"]], "C++": [[6, "c"]], "C++ Unit Testing": [[5, "c-unit-testing"]], "CPU Channels Last Targets": [[18, "cpu-channels-last-targets"]], "CPU ISA build compiler requirement": [[17, "cpu-isa-build-compiler-requirement"]], "CPU Runtime": [[2, "module-intel_extension_for_pytorch.cpu.runtime"]], "CPU feature check": [[17, "cpu-feature-check"]], "Calibration": [[6, "calibration"]], "Channels Last": [[18, null], [33, "channels-last"]], "Cheat Sheet": [[4, null]], "Code Folder Struct": [[17, "code-folder-struct"]], "CodeGen Process": [[17, "codegen-process"]], "Codeless Optimization (Prototype)": [[10, null]], "Codeless Optimization (Prototype, NEW feature from 1.13.0)": [[7, "codeless-optimization-prototype-new-feature-from-1-13-0"]], "Command to apply ipex optimization for BF16": [[10, 
"command-to-apply-ipex-optimization-for-bf16"]], "Command to apply ipex optimization for FP32": [[10, "command-to-apply-ipex-optimization-for-fp32"]], "Configuration": [[30, "configuration"], [30, "id2"], [30, "id5"]], "Contents of this Document": [[32, "contents-of-this-document"], [33, "contents-of-this-document"]], "Contributing to Intel\u00ae Extension for PyTorch*": [[5, "contributing-to-intel-extension-for-pytorch"]], "Contribution": [[5, null]], "Convert to Dynamic Quantized Model and Deploy": [[15, "convert-to-dynamic-quantized-model-and-deploy"]], "Convert to Static Quantized Model and Deploy": [[15, "convert-to-static-quantized-model-and-deploy"]], "Creating and Exporting INT8 model for Intel\u00ae Extension for PyTorch*": [[32, "creating-and-exporting-int8-model-for-intel-extension-for-pytorch"]], "Default Precision": [[8, "default-precision"]], "Default memory allocator": [[31, "default-memory-allocator"]], "Default search space": [[14, "default-search-space"]], "Define QConfig": [[15, "id1"]], "Define qconfig": [[15, "define-qconfig"]], "Defining hyperparameters and their search spaces": [[14, "defining-hyperparameters-and-their-search-spaces"]], "Demos": [[28, "demos"]], "Denormal Number": [[33, "denormal-number"]], "Deployment": [[6, "deployment"]], "Design of Task": [[20, "design-of-task"]], "Detail Design": [[20, "detail-design"]], "Determining the alpha through auto-tuning": [[16, "determining-the-alpha-through-auto-tuning"]], "Developing Intel\u00ae Extension for PyTorch*": [[5, "developing-intel-extension-for-pytorch"]], "Dispatch Stub implementation: csrc/cpu/dyndisp/DispatchStub.cpp and csrc/cpu/dyndisp/DispatchStub.h": [[17, "dispatch-stub-implementation-csrc-cpu-dyndisp-dispatchstub-cpp-and-csrc-cpu-dyndisp-dispatchstub-h"]], "Distributed Inference": [[28, "distributed-inference"]], "Distributed Inference with DeepSpeed": [[29, "distributed-inference-with-deepspeed"]], "Distributed Training": [[6, "distributed-training"]], "Dynamic Dispatch 
Design": [[17, "dynamic-dispatch-design"]], "Dynamic Quantization": [[6, "dynamic-quantization"], [15, "dynamic-quantization"]], "Dynamic Shape": [[26, "dynamic-shape"]], "Eager Mode": [[6, "eager-mode"], [6, "id5"]], "Ease-of-use auto channels last API": [[9, "ease-of-use-auto-channels-last-api"]], "Ease-of-use graph optimization API": [[13, "ease-of-use-graph-optimization-api"]], "Easy-to-use Python API": [[7, "easy-to-use-python-api"]], "Example Usage with HuggingFace": [[10, "example-usage-with-huggingface"]], "Example of MultiStream Module": [[20, "example-of-multistream-module"]], "Example of asynchronous task": [[20, "example-of-asynchronous-task"]], "Example of configuring core binding": [[20, "example-of-configuring-core-binding"]], "Example:": [[17, "example"], [17, "id1"]], "Examples": [[6, null]], "Examples1: Basic Usage": [[20, "examples1-basic-usage"]], "Examples2: Usage with \u201cAUTO\u201d setting": [[20, "examples2-usage-with-auto-setting"]], "Examples3: Usage for models with structure inputs/outputs": [[20, "examples3-usage-for-models-with-structure-inputs-outputs"]], "FP32 and BF16 fusion patterns": [[13, "fp32-and-bf16-fusion-patterns"]], "FP32 and BF16 models": [[13, "fp32-and-bf16-models"]], "FP32 and BFloat16 with v1.10": [[30, "fp32-and-bfloat16-with-v1-10"]], "FP32 with v1.11.200 on an AWS EC2 C6i.2xlarge instance": [[30, "fp32-with-v1-11-200-on-an-aws-ec2-c6i-2xlarge-instance"]], "FP32/BF16": [[6, "fp32-bf16"], [29, "fp32-bf16"]], "Fast BERT (Prototype)": [[11, null]], "Fast BERT Optimization (Prototype, NEW feature from 2.0.0)": [[7, "fast-bert-optimization-prototype-new-feature-from-2-0-0"]], "Fast Bert (Prototype)": [[2, "fast-bert-prototype"], [6, "fast-bert-prototype"]], "Feature Description": [[11, "feature-description"], [12, "feature-description"]], "Features": [[7, null]], "Float32": [[6, "float32"]], "Folding": [[13, "folding"]], "Fusion": [[13, "fusion"]], "GNU OpenMP": [[33, "gnu-openmp"]], "GNU OpenMP Library": [[31, 
"gnu-openmp-library"]], "General": [[2, "general"]], "General Usage": [[26, "general-usage"]], "Get Started": [[25, "get-started"]], "Graph Capture (Prototype)": [[12, null]], "Graph Capture (Prototype, NEW feature from 1.13.0)": [[7, "graph-capture-prototype-new-feature-from-1-13-0"]], "Graph Optimization": [[2, "graph-optimization"], [7, "graph-optimization"], [13, null], [28, "graph-optimization"]], "Hardware Configuration": [[30, "hardware-configuration"], [30, "id7"], [33, "hardware-configuration"]], "Highlights": [[34, "highlights"], [34, "id3"], [34, "id5"], [34, "id7"], [34, "id9"], [34, "id11"], [34, "id13"], [34, "id15"], [34, "id17"], [34, "id19"], [34, "id22"], [34, "id25"], [34, "id28"], [34, "id30"], [34, "id33"]], "How the core binding is implemented": [[20, "how-the-core-binding-is-implemented"]], "HyperTune (Prototype)": [[14, null]], "HyperTune (Prototype, NEW feature from 1.13.0)": [[7, "hypertune-prototype-new-feature-from-1-13-0"]], "Hyperparameters": [[14, "hyperparameters"]], "I. Use all physical cores": [[31, "i-use-all-physical-cores"]], "II. Use all cores including logical cores": [[31, "ii-use-all-cores-including-logical-cores"]], "III. Use physical cores on designated nodes": [[31, "iii-use-physical-cores-on-designated-nodes"]], "INT8": [[6, "int8"], [26, "int8"]], "INT8 Quantization": [[7, "int8-quantization"]], "INT8 Recipe Tuning API (Prototype)": [[16, null]], "INT8 fusion patterns": [[13, "int8-fusion-patterns"]], "INT8 models": [[13, "int8-models"]], "INT8 with v1.11": [[30, "int8-with-v1-11"]], "IOMP preload or load during the runtime": [[20, "iomp-preload-or-load-during-the-runtime"]], "ISA Dynamic Dispatching": [[7, "isa-dynamic-dispatching"], [17, null]], "ISA intrinics specific kernel example:": [[17, "isa-intrinics-specific-kernel-example"]], "IV. 
Use your designated number of cores": [[31, "iv-use-your-designated-number-of-cores"]], "Indirect Access KV Cache": [[28, "indirect-access-kv-cache"]], "Inference": [[6, "inference"]], "Inference with Eager Path": [[8, "inference-with-eager-path"]], "Inference with TorchScript Path": [[8, "inference-with-torchscript-path"]], "Install Intel\u00ae Extension for PyTorch*": [[32, "install-intel-extension-for-pytorch"]], "Installation": [[24, null]], "Intel CPU Structure": [[33, "intel-cpu-structure"]], "Intel OpenMP": [[33, "intel-openmp"]], "Intel OpenMP Library": [[31, "intel-openmp-library"]], "Intel\u00ae AI Reference Models": [[6, "intel-ai-reference-models"]], "Intel\u00ae Extension for PyTorch*": [[1, null]], "Intel\u00ae Extension for PyTorch* CPU ISA Dynamic Dispatch Design Doc": [[0, null]], "Intel\u00ae Extension for PyTorch* optimizations for quantization": [[15, null]], "Introduction": [[8, "introduction"], [19, "introduction"], [25, null]], "Jemalloc": [[31, "jemalloc"], [33, "jemalloc"]], "Kernel Stub: csrc/cpu/aten/xyz.cpp and csrc/cpu/aten/xyz.h": [[17, "kernel-stub-csrc-cpu-aten-xyz-cpp-and-csrc-cpu-aten-xyz-h"]], "Kernel implementation: csrc/cpu/aten/kernels/xyzKrnl.cpp": [[17, "kernel-implementation-csrc-cpu-aten-kernels-xyzkrnl-cpp"]], "Known Issues": [[34, "known-issues"], [34, "id20"], [34, "id26"], [34, "id34"]], "Known issue": [[9, "known-issue"], [34, "known-issue"], [34, "id47"]], "Known issues": [[20, "known-issues"], [34, "id41"]], "LLM Module Level Optimizations (Prototype)": [[2, "llm-module-level-optimizations-prototype"]], "LLM Optimizations Frontend API": [[29, null]], "LLM Performance": [[30, "llm-performance"]], "LLM Quick Start": [[23, "llm-quick-start"]], "Large Language Model (LLM)": [[6, "large-language-model-llm"]], "Large Language Models (LLM) Optimization Overview": [[28, null]], "Large Language Models (LLM, NEW feature from 2.1.0)": [[7, "large-language-models-llm-new-feature-from-2-1-0"]], "Launch Script Usage Guide": [[31, 
null]], "Launcher Core Pinning to Boost Performance of TorchServe Multi Worker Inference": [[32, "launcher-core-pinning-to-boost-performance-of-torchserve-multi-worker-inference"]], "Launcher Hyperparameters": [[14, "launcher-hyperparameters"]], "License": [[27, null]], "Linear Operator Optimization": [[28, "linear-operator-optimization"]], "Local linting": [[5, "local-linting"]], "Low Precision Data Types": [[28, "low-precision-data-types"]], "Memory Allocator": [[33, "memory-allocator"]], "Memory Format Is All That Matters": [[18, "memory-format-is-all-that-matters"]], "Methodology": [[13, "methodology"]], "Module Level Optimization API for customized LLM (Prototype)": [[28, "module-level-optimization-api-for-customized-llm-prototype"]], "Module uses forward method explicitly instead of the __call__ attr": [[10, "module-uses-forward-method-explicitly-instead-of-the-call-attr"]], "Motivation": [[10, "motivation"]], "Multiple instances for inference": [[31, "multiple-instances-for-inference"]], "NOTE": [[34, "note"]], "Non-Uniform Memory Access (NUMA)": [[33, "non-uniform-memory-access-numa"]], "Numactl": [[33, "numactl"]], "OMP_NUM_THREADS": [[33, "omp-num-threads"]], "OMP_THREAD_LIMIT": [[33, "omp-thread-limit"]], "OneDNN primitive cache": [[33, "onednn-primitive-cache"]], "Op Eligibility": [[8, "op-eligibility"]], "Op-Specific Behavior": [[8, "op-specific-behavior"]], "OpenMP": [[33, "openmp"]], "Operation Fusion": [[19, "operation-fusion"]], "Operator Optimization": [[7, "operator-optimization"]], "Ops that can autocast to bfloat16": [[8, "ops-that-can-autocast-to-bfloat16"]], "Ops that can autocast to float32": [[8, "ops-that-can-autocast-to-float32"]], "Ops that promote to the widest input type": [[8, "ops-that-promote-to-the-widest-input-type"]], "Optimization Methodologies": [[28, "optimization-methodologies"]], "Optimizer Fusion": [[19, null]], "Optimizer Optimization": [[7, "optimizer-optimization"]], "Others": [[34, "others"]], "Overview": [[17, 
"overview"], [30, "overview"], [31, "overview"], [33, "overview"]], "Performance": [[30, null], [34, "performance"]], "Performance Boost with Intel\u00ae Extension for PyTorch* and Launcher": [[32, "performance-boost-with-intel-extension-for-pytorch-and-launcher"]], "Performance Data for Intel\u00ae AI Data Center Products": [[30, "performance-data-for-intel-ai-data-center-products"]], "Performance Improvement": [[34, "performance-improvement"]], "Performance Numbers": [[30, "performance-numbers"], [30, "id1"], [30, "id4"]], "Performance Regression": [[26, "performance-regression"]], "Performance Result": [[34, "performance-result"]], "Performance Tuning Guide": [[33, null]], "Performance recipes": [[20, "performance-recipes"]], "Prepare Model": [[15, "prepare-model"]], "Prepare Model and Do Calibration": [[15, "prepare-model-and-do-calibration"]], "Prerequisite": [[11, "prerequisite"]], "Private Debug APIs": [[17, "private-debug-apis"]], "Pseudocode of Common Usage Scenarios": [[29, "pseudocode-of-common-usage-scenarios"]], "PyTorch Channels Last Memory Format APIs": [[18, "pytorch-channels-last-memory-format-apis"]], "PyTorch Strided Layout": [[18, "pytorch-strided-layout"]], "Python": [[6, "python"]], "Python Unit Testing": [[5, "python-unit-testing"]], "Quantization": [[2, "module-intel_extension_for_pytorch.quantization"]], "Quick Start": [[23, null]], "Releases": [[34, null]], "Requirements": [[20, "requirements"]], "ResNet50": [[32, "resnet50"]], "Resnet50": [[6, "resnet50"], [6, "id1"], [6, "id3"], [6, "id6"], [6, "id9"], [6, "id12"]], "Result Correctness": [[26, "result-correctness"]], "Runtime Extension": [[7, "runtime-extension"], [20, null], [26, "runtime-extension"]], "Scaling workers": [[32, "scaling-workers"]], "Select ISA level manually.": [[17, "select-isa-level-manually"]], "Serving model with Intel\u00ae Extension for PyTorch*": [[32, "serving-model-with-intel-extension-for-pytorch"]], "Single instance for inference": [[31, 
"single-instance-for-inference"]], "Smooth Quant Recipe Tuning API (Prototype)": [[22, null]], "Smooth Quantization Autotune": [[16, "smooth-quantization-autotune"]], "Smooth Quantization INT8": [[6, "smooth-quantization-int8"]], "SmoothQuant": [[29, "smoothquant"]], "Software Configuration": [[33, "software-configuration"]], "Software Version": [[30, "software-version"], [30, "id3"], [30, "id6"]], "Split SGD": [[21, null], [21, "id2"]], "Static Quantization": [[6, "static-quantization"], [15, "static-quantization"]], "Stochastic Gradient Descent (SGD)": [[21, "stochastic-gradient-descent-sgd"]], "Support": [[1, "support"]], "TCMalloc": [[31, "tcmalloc"], [33, "tcmalloc"]], "The origin command with ipex launch": [[10, "the-origin-command-with-ipex-launch"]], "Tips": [[5, "tips"]], "Tips and Debugging": [[5, "tips-and-debugging"]], "TorchDynamo": [[26, "torchdynamo"]], "TorchDynamo Mode (Beta, NEW feature from 2.0.0)": [[6, "torchdynamo-mode-beta-new-feature-from-2-0-0"], [6, "id11"]], "TorchScript Mode": [[6, "torchscript-mode"], [6, "id8"]], "TorchServe with Intel\u00ae Extension for PyTorch*": [[32, null]], "TorchServe with Launcher": [[32, "torchserve-with-launcher"]], "Training": [[6, "training"]], "Training Support": [[8, "training-support"]], "Troubleshooting": [[26, null]], "Unit testing": [[5, "unit-testing"]], "Usage Example": [[11, "usage-example"], [12, "usage-example"], [16, "usage-example"]], "Usage Examples": [[14, "usage-examples"], [31, "usage-examples"]], "Usage of Hypertune": [[14, "usage-of-hypertune"]], "Usage of Jemalloc/TCMalloc/Default memory allocator": [[31, "usage-of-jemalloc-tcmalloc-default-memory-allocator"]], "Usage of OpenMP library": [[31, "usage-of-openmp-library"]], "Usage of launch script": [[31, "usage-of-launch-script"]], "Use Case": [[8, "use-case"]], "Use Case not supported": [[10, "use-case-not-supported"]], "Use Cases": [[20, "use-cases"]], "User defined search space": [[14, "user-defined-search-space"]], "Using a fixed 
alpha": [[16, "using-a-fixed-alpha"]], "V. Throughput mode": [[31, "v-throughput-mode"]], "VI. Latency mode": [[31, "vi-latency-mode"]], "VII. Your designated number of instances": [[31, "vii-your-designated-number-of-instances"]], "VIII. Your designated number of instances and instance index": [[31, "viii-your-designated-number-of-instances-and-instance-index"]], "Vec specific kernel example:": [[17, "vec-specific-kernel-example"]], "Verified for distributed inference mode via DeepSpeed": [[28, "verified-for-distributed-inference-mode-via-deepspeed"]], "Verified for single instance mode": [[28, "verified-for-single-instance-mode"]], "Weight Only Quantization (WOQ)": [[29, "weight-only-quantization-woq"]], "Weight Only Quantization INT8/INT4": [[6, "weight-only-quantization-int8-int4"]], "What is Channels Last": [[18, "what-is-channels-last"]], "What\u2019s Changed": [[34, "what-s-changed"], [34, "id35"]], "What\u2019s New": [[34, "what-s-new"], [34, "id38"], [34, "id40"], [34, "id43"], [34, "id46"]], "Writing Channels Last Kernels": [[18, "writing-channels-last-kernels"]], "Writing documentation": [[5, "writing-documentation"]], "a. Create NHWC Memory": [[18, "a-create-nhwc-memory"]], "a. NCHW (default)": [[18, "a-nchw-default"]], "a. Status on CPU": [[18, "a-status-on-cpu"]], "a. tensor creation": [[18, "a-tensor-creation"]], "b. Create Convolution Primitive": [[18, "b-create-convolution-primitive"]], "b. NHWC (WIP for CPU)": [[18, "b-nhwc-wip-for-cpu"]], "b. Register Channels Last Kernel in ATen Native Manner": [[18, "b-register-channels-last-kernel-in-aten-native-manner"]], "b. tensor conversion": [[18, "b-tensor-conversion"]], "c. Blocked (nChw16c)": [[18, "c-blocked-nchw16c"]], "c. Register oneDNN Kernel on Channels Last": [[18, "c-register-onednn-kernel-on-channels-last"]], "c. model conversion": [[18, "c-model-conversion"]], "d. 
operator coverage": [[18, "d-operator-coverage"]], "default": [[9, "default"]], "disable": [[9, "disable"]], "enable": [[9, "enable"]], "ipex.llm Optimized Model List for Inference": [[28, "ipex-llm-optimized-model-list-for-inference"]], "oneDNN NHWC APIs": [[18, "onednn-nhwc-apis"]], "torch.compile (Beta, NEW feature from 2.0.0)": [[7, "torch-compile-beta-new-feature-from-2-0-0"]], "your_conf_file": [[14, "your-conf-file"]]}, "docnames": ["design_doc/cpu/isa_dyndisp", "index", "tutorials/api_doc", "tutorials/blogs_publications", "tutorials/cheat_sheet", "tutorials/contribution", "tutorials/examples", "tutorials/features", "tutorials/features/amp", "tutorials/features/auto_channels_last", "tutorials/features/codeless_optimization", "tutorials/features/fast_bert", "tutorials/features/graph_capture", "tutorials/features/graph_optimization", "tutorials/features/hypertune", "tutorials/features/int8_overview", "tutorials/features/int8_recipe_tuning_api", "tutorials/features/isa_dynamic_dispatch", "tutorials/features/nhwc", "tutorials/features/optimizer_fusion", "tutorials/features/runtime_extension", "tutorials/features/split_sgd", "tutorials/features/sq_recipe_tuning_api", "tutorials/getting_started", "tutorials/installation", "tutorials/introduction", "tutorials/known_issues", "tutorials/license", "tutorials/llm", "tutorials/llm/llm_optimize", "tutorials/performance", "tutorials/performance_tuning/launch_script", "tutorials/performance_tuning/torchserve", "tutorials/performance_tuning/tuning_guide", "tutorials/releases"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["design_doc/cpu/isa_dyndisp.md", "index.rst", "tutorials/api_doc.rst", "tutorials/blogs_publications.md", "tutorials/cheat_sheet.md", 
"tutorials/contribution.md", "tutorials/examples.md", "tutorials/features.rst", "tutorials/features/amp.md", "tutorials/features/auto_channels_last.md", "tutorials/features/codeless_optimization.md", "tutorials/features/fast_bert.md", "tutorials/features/graph_capture.md", "tutorials/features/graph_optimization.md", "tutorials/features/hypertune.md", "tutorials/features/int8_overview.md", "tutorials/features/int8_recipe_tuning_api.md", "tutorials/features/isa_dynamic_dispatch.md", "tutorials/features/nhwc.md", "tutorials/features/optimizer_fusion.md", "tutorials/features/runtime_extension.md", "tutorials/features/split_sgd.rst", "tutorials/features/sq_recipe_tuning_api.md", "tutorials/getting_started.md", "tutorials/installation.md", "tutorials/introduction.rst", "tutorials/known_issues.md", "tutorials/license.md", "tutorials/llm.rst", "tutorials/llm/llm_optimize.md", "tutorials/performance.md", "tutorials/performance_tuning/launch_script.md", "tutorials/performance_tuning/torchserve.md", "tutorials/performance_tuning/tuning_guide.md", "tutorials/releases.md"], "indexentries": {"autotune() (in module intel_extension_for_pytorch.quantization)": [[2, "intel_extension_for_pytorch.quantization.autotune", false]], "convert() (in module intel_extension_for_pytorch.quantization)": [[2, "intel_extension_for_pytorch.quantization.convert", false]], "cpupool (class in intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.CPUPool", false]], "enable_onednn_fusion() (in module intel_extension_for_pytorch)": [[2, "intel_extension_for_pytorch.enable_onednn_fusion", false]], "fast_bert() (in module intel_extension_for_pytorch)": [[2, "intel_extension_for_pytorch.fast_bert", false]], "fast_layer_norm() (in module intel_extension_for_pytorch.llm.functional)": [[2, "intel_extension_for_pytorch.llm.functional.fast_layer_norm", false]], "fastlayernorm (class in intel_extension_for_pytorch.llm.modules)": [[2, 
"intel_extension_for_pytorch.llm.modules.FastLayerNorm", false]], "frozenbatchnorm2d (class in intel_extension_for_pytorch.nn)": [[7, "intel_extension_for_pytorch.nn.FrozenBatchNorm2d", false]], "get_core_list_of_node_id() (in module intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.get_core_list_of_node_id", false]], "get_smooth_quant_qconfig_mapping() (in module intel_extension_for_pytorch.quantization)": [[2, "intel_extension_for_pytorch.quantization.get_smooth_quant_qconfig_mapping", false]], "get_weight_only_quant_qconfig_mapping() (in module intel_extension_for_pytorch.quantization)": [[2, "intel_extension_for_pytorch.quantization.get_weight_only_quant_qconfig_mapping", false]], "indirect_access_kv_cache_attention() (in module intel_extension_for_pytorch.llm.functional)": [[2, "intel_extension_for_pytorch.llm.functional.indirect_access_kv_cache_attention", false]], "indirectaccesskvcacheattention (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.IndirectAccessKVCacheAttention", false]], "intel_extension_for_pytorch": [[2, "module-intel_extension_for_pytorch", false]], "intel_extension_for_pytorch.cpu.runtime": [[2, "module-intel_extension_for_pytorch.cpu.runtime", false]], "intel_extension_for_pytorch.llm": [[2, "module-intel_extension_for_pytorch.llm", false]], "intel_extension_for_pytorch.llm.functional": [[2, "module-intel_extension_for_pytorch.llm.functional", false]], "intel_extension_for_pytorch.llm.modules": [[2, "module-intel_extension_for_pytorch.llm.modules", false]], "intel_extension_for_pytorch.quantization": [[2, "module-intel_extension_for_pytorch.quantization", false]], "interaction() (in module intel_extension_for_pytorch.nn.functional)": [[7, "intel_extension_for_pytorch.nn.functional.interaction", false]], "is_runtime_ext_enabled() (in module intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.is_runtime_ext_enabled", 
false]], "linear2silumul (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.Linear2SiluMul", false]], "linearadd (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearAdd", false]], "linearaddadd (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearAddAdd", false]], "lineargelu (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearGelu", false]], "linearmul (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearMul", false]], "linearnewgelu (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearNewGelu", false]], "linearrelu (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearRelu", false]], "linearsilu (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearSilu", false]], "linearsilumul (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearSiluMul", false]], "mergedembeddingbag (class in intel_extension_for_pytorch.nn.modules)": [[7, "intel_extension_for_pytorch.nn.modules.MergedEmbeddingBag", false]], "mergedembeddingbagwithsgd (class in intel_extension_for_pytorch.nn.modules)": [[7, "intel_extension_for_pytorch.nn.modules.MergedEmbeddingBagWithSGD", false]], "module": [[2, "module-intel_extension_for_pytorch", false], [2, "module-intel_extension_for_pytorch.cpu.runtime", false], [2, "module-intel_extension_for_pytorch.llm", false], [2, "module-intel_extension_for_pytorch.llm.functional", false], [2, "module-intel_extension_for_pytorch.llm.modules", false], [2, "module-intel_extension_for_pytorch.quantization", false]], "multistreammodule (class in intel_extension_for_pytorch.cpu.runtime)": [[2, 
"intel_extension_for_pytorch.cpu.runtime.MultiStreamModule", false]], "multistreammodulehint (class in intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.MultiStreamModuleHint", false]], "optimize() (in module intel_extension_for_pytorch)": [[2, "intel_extension_for_pytorch.optimize", false]], "optimize() (in module intel_extension_for_pytorch.llm)": [[2, "intel_extension_for_pytorch.llm.optimize", false]], "pagedattention (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.PagedAttention", false]], "pin (class in intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.pin", false]], "prepare() (in module intel_extension_for_pytorch.quantization)": [[2, "intel_extension_for_pytorch.quantization.prepare", false]], "rms_norm() (in module intel_extension_for_pytorch.llm.functional)": [[2, "intel_extension_for_pytorch.llm.functional.rms_norm", false]], "rmsnorm (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.RMSNorm", false]], "rotary_embedding() (in module intel_extension_for_pytorch.llm.functional)": [[2, "intel_extension_for_pytorch.llm.functional.rotary_embedding", false]], "rotaryembedding (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.RotaryEmbedding", false]], "task (class in intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.Task", false]], "varlen_attention() (in module intel_extension_for_pytorch.llm.functional)": [[2, "intel_extension_for_pytorch.llm.functional.varlen_attention", false]], "varlenattention (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.VarlenAttention", false]], "verbose (class in intel_extension_for_pytorch)": [[2, "intel_extension_for_pytorch.verbose", false]]}, "objects": {"": [[2, 0, 0, "-", "intel_extension_for_pytorch"]], 
"intel_extension_for_pytorch": [[2, 2, 1, "", "enable_onednn_fusion"], [2, 2, 1, "", "fast_bert"], [2, 0, 0, "-", "llm"], [2, 2, 1, "", "optimize"], [2, 0, 0, "-", "quantization"], [2, 1, 1, "", "verbose"]], "intel_extension_for_pytorch.cpu": [[2, 0, 0, "-", "runtime"]], "intel_extension_for_pytorch.cpu.runtime": [[2, 1, 1, "", "CPUPool"], [2, 1, 1, "", "MultiStreamModule"], [2, 1, 1, "", "MultiStreamModuleHint"], [2, 1, 1, "", "Task"], [2, 2, 1, "", "get_core_list_of_node_id"], [2, 2, 1, "", "is_runtime_ext_enabled"], [2, 1, 1, "", "pin"]], "intel_extension_for_pytorch.llm": [[2, 0, 0, "-", "functional"], [2, 0, 0, "-", "modules"], [2, 2, 1, "", "optimize"]], "intel_extension_for_pytorch.llm.functional": [[2, 2, 1, "", "fast_layer_norm"], [2, 2, 1, "", "indirect_access_kv_cache_attention"], [2, 2, 1, "", "rms_norm"], [2, 2, 1, "", "rotary_embedding"], [2, 2, 1, "", "varlen_attention"]], "intel_extension_for_pytorch.llm.modules": [[2, 1, 1, "", "FastLayerNorm"], [2, 1, 1, "", "IndirectAccessKVCacheAttention"], [2, 1, 1, "", "Linear2SiluMul"], [2, 1, 1, "", "LinearAdd"], [2, 1, 1, "", "LinearAddAdd"], [2, 1, 1, "", "LinearGelu"], [2, 1, 1, "", "LinearMul"], [2, 1, 1, "", "LinearNewGelu"], [2, 1, 1, "", "LinearRelu"], [2, 1, 1, "", "LinearSilu"], [2, 1, 1, "", "LinearSiluMul"], [2, 1, 1, "", "PagedAttention"], [2, 1, 1, "", "RMSNorm"], [2, 1, 1, "", "RotaryEmbedding"], [2, 1, 1, "", "VarlenAttention"]], "intel_extension_for_pytorch.nn": [[7, 1, 1, "", "FrozenBatchNorm2d"]], "intel_extension_for_pytorch.nn.functional": [[7, 2, 1, "", "interaction"]], "intel_extension_for_pytorch.nn.modules": [[7, 1, 1, "", "MergedEmbeddingBag"], [7, 1, 1, "", "MergedEmbeddingBagWithSGD"]], "intel_extension_for_pytorch.quantization": [[2, 2, 1, "", "autotune"], [2, 2, 1, "", "convert"], [2, 2, 1, "", "get_smooth_quant_qconfig_mapping"], [2, 2, 1, "", "get_weight_only_quant_qconfig_mapping"], [2, 2, 1, "", "prepare"]]}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", 
"class", "Python class"], "2": ["py", "function", "Python function"]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:function"}, "terms": {"": [2, 3, 5, 8, 10, 14, 15, 18, 19, 20, 21, 22, 26, 31, 32, 33], "0": [1, 2, 4, 5, 8, 10, 11, 13, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 30, 31, 32, 33], "00": [31, 34], "00000": 21, "00000000000602e7": 17, "0000012345": 21, "001": [6, 8], "0016": 30, "01": [2, 4, 7, 16, 31, 32, 34], "02": [30, 32], "02x": 30, "03": 32, "03x": 30, "04": [30, 31], "04x": 30, "05": [2, 7, 10, 30, 31], "05x": 30, "06": [2, 31, 32], "06x": 30, "07": 31, "07x": 30, "08": 31, "08x": 30, "09": [17, 31], "096": 32, "09864": 2, "09x": 30, "0x00007f3cde954000": 6, "0x00007f3ce16ac000": 6, "0x00007f3cf70fc000": 6, "0x00007f3cf985a000": 6, "0x00007f3cf98e0000": 6, "0x1": 17, "0x700001c": 30, "0x7fff": 17, "0xd0002a0": 30, "0xffff": 17, "1": [1, 2, 3, 4, 6, 8, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 28, 29, 30, 31, 33], "10": [7, 14, 16, 17, 18, 21, 25, 26, 31, 32, 33], "100": [2, 4, 14, 16, 17, 30, 32], "10000": 2, "1009": 30, "100mb": 34, "1024": [30, 33], "102b": 28, "1032": 34, "10438": 2, "1053": 34, "1074": 34, "10k": 6, "10x": 30, "11": [17, 31, 32], "111": 33, "112": [26, 30, 33, 34], "117": 31, "118": 31, "11b": [28, 34], "11x": 30, "12": [6, 10, 14, 17, 30, 31, 32], "1200": 30, "12345": 21, "1234500000": 21, "1234512345": 21, "125m": 6, "127": [6, 31, 34], "128": [6, 8, 10, 13, 20, 30, 34], "128k": [2, 28, 34], "128task": 30, "1295": 34, "12b": 28, "12x": 30, "13": [3, 10, 17, 30, 31, 32, 33], "1318": 34, "1322": 34, "1328": 34, "1330": 34, "1338": 34, "1341": 34, "1353": 34, "1355": 34, "1367": 34, "1373": 34, "1376": 34, "1384": 34, "1391": 34, "1392": 34, "13b": [28, 30, 34], "13x": 30, "14": [31, 32, 34], "140": 31, "1414": 34, "1419": 34, "143": 31, "146": 31, "1473": 34, "1488": 34, "149": 31, "14x": 30, "15": [14, 17, 30, 31, 32], "151": 31, "1513": 34, "1517": 34, "154": 31, "1563": 34, "1564": 34, "1566": 
34, "1568": 34, "157": 31, "1580": 34, "1585": 34, "1587": 34, "1589": 34, "159": 31, "1590": 34, "1592": 34, "1593": 34, "1594": 34, "15x": 30, "16": [2, 17, 20, 21, 30, 31, 32], "160": 30, "162": 31, "164": 31, "1664": 34, "167": 31, "1677": 34, "1682": 34, "1688": 34, "1695": 34, "16gb": 30, "16x": 30, "16xlarg": 30, "17": [6, 30, 31, 32], "170": 30, "175": 31, "176": 31, "177": 31, "17th": 30, "18": [30, 31, 32], "18x": 30, "19": [7, 30, 31, 32, 34], "199": 30, "19x": 30, "1_6b": 28, "1b7": 28, "1d": 18, "1e": [2, 7, 10, 16], "1mb": 33, "2": [1, 2, 3, 8, 10, 11, 16, 17, 18, 20, 21, 25, 26, 27, 28, 29, 30, 31, 33], "20": [2, 7, 18, 30, 31, 32, 34], "2006080250": 30, "200m": 33, "2017": 3, "2019": 3, "2020": 3, "2021": [3, 17, 31, 32], "2022": [3, 31, 32], "2023": [2, 3, 30], "2024": 33, "2048": [2, 6], "205": 34, "20b": 28, "20x": 30, "21": [30, 31, 32], "2104": 2, "2105": 30, "2137": 34, "2195": 34, "2198": 34, "21x": 30, "22": [6, 30, 31, 32], "220m": 34, "220mb": 34, "2211": 2, "2229": 34, "223": 32, "2236": 34, "224": [6, 8, 10, 12, 13, 30, 32, 34], "224m": 34, "2251": 34, "2253": 34, "2257": 34, "2264": 34, "2275": 34, "2278": 34, "2280": 34, "2283": 34, "2290": 34, "2292": 34, "2299": 34, "23": [21, 31, 32], "2315": 34, "2317": 34, "2319": 34, "233": 31, "2334": 34, "2349": 34, "235": 31, "236": 31, "2392": 34, "24": [31, 32], "2412": 34, "2433": 34, "244": 13, "2468": 34, "2469": 34, "2473": 34, "2476": 34, "2480": 34, "2491": 34, "24x": 30, "24xlarg": 32, "25": [31, 32], "2511": 34, "2550": 34, "256": [2, 30], "2561": 34, "2568": 34, "256gb": 30, "2584": 34, "26": [30, 31, 32], "2613": 34, "2617": 34, "2627": 34, "2631": 34, "2641": 34, "2663": 34, "2666": 33, "2675": 34, "26x": 30, "27": [31, 32, 33], "2704": 34, "2733": 34, "274": 32, "2747": 34, "278": 34, "27x": 30, "28": [10, 14, 16, 30, 31, 32, 33, 34], "2883": 34, "29": [7, 31, 32], "2910": 34, "2911": 34, "2928": 34, "29500": [6, 31], "2985": 34, "2987": 34, "29x": 30, "2b": 28, "2d": 18, "2nd": 
28, "2x": 34, "3": [2, 5, 6, 7, 8, 10, 12, 13, 14, 16, 17, 18, 20, 21, 28, 30, 31, 33], "30": [31, 32], "3030": 34, "305": 30, "3079": 34, "3080": 34, "30b": 28, "30ghz": 30, "30x": 30, "31": [31, 32], "3116": 34, "3143": 34, "31x": 30, "32": [2, 6, 18, 21, 23, 30, 31, 32], "3200": 30, "32x": 30, "32x16d": 30, "33": [17, 31, 32], "339081764221191": 14, "33x": 30, "34": [31, 32], "35": [31, 32], "355": 31, "356": 31, "35x": 30, "36": [30, 31, 32], "36x": 30, "37": [31, 32, 34], "38": [31, 32], "384": [10, 32, 34], "384task": 30, "38x": 30, "39": [30, 31, 32, 34], "39x": 30, "3b": 28, "3d": 34, "3e": [10, 34], "3rd": [3, 7, 21, 30, 34], "4": [2, 6, 11, 13, 14, 18, 20, 23, 28, 30, 31, 33], "40": [30, 31, 32, 34], "407": 34, "409": 26, "4096": [2, 33], "40b": 28, "40mb": 34, "41": [31, 32], "42": [31, 32], "425": 34, "43": [6, 11, 31, 32], "432": 34, "438": 34, "44": [30, 31, 32], "44x": 30, "45": [6, 31, 32], "452": 34, "45x": 30, "46": [31, 32], "47": [31, 32], "470": 31, "471": 31, "473": 31, "476": 31, "479": 31, "47x": 30, "48": [30, 31, 32], "48x": 30, "49": [30, 31, 32], "49786": 34, "4bit": 34, "4k": 28, "4th": [28, 30], "4x": 3, "5": [2, 6, 10, 13, 14, 16, 17, 18, 19, 20, 21, 22, 26, 28, 30, 31, 32, 33, 34], "50": [18, 31, 32], "50ghz": 33, "51": [31, 32], "512": [1, 6, 11, 16, 25, 28, 31], "513": 31, "52": [31, 32], "524": 34, "53": [31, 32], "531": 34, "54": [31, 32], "55": [31, 32, 33], "551": 34, "55x": 30, "56": [30, 31, 32, 33], "57": 31, "57x": 30, "58": [17, 31], "589": 34, "58x": 30, "59": 31, "591": 31, "5d": 16, "5m": 34, "5mb": 34, "5rc3": 34, "5x": 34, "6": [2, 5, 7, 11, 14, 20, 30, 31, 32, 33, 34], "60": 31, "602": 34, "61": 31, "62": 31, "62x": 30, "63": [31, 34], "64": [2, 8, 10, 16, 20, 30, 31, 34], "642": 34, "647": 34, "648": 34, "64byte": 34, "64gb": 30, "65": 31, "654": 31, "655": 31, "65536": 33, "657": 34, "66": [17, 31, 34], "67": [30, 31, 34], "674": 34, "67x": 30, "68": [31, 34], "684": 34, "685": 34, "69": [30, 31], "692": 34, "6b": 
[2, 28, 30], "7": [10, 14, 17, 20, 21, 31, 32, 34], "70": 31, "70b": [28, 34], "71": 31, "711": 34, "71x": 30, "72": 31, "73": 31, "74": 31, "75": [30, 31], "75x": 30, "76": [30, 31], "760": [31, 32], "761": [31, 32], "762": 32, "763": 32, "764": 31, "768gb": 30, "77": 31, "77x": 30, "78": [30, 31], "784": 31, "787": 34, "78x": 30, "79": [30, 31], "7b": [6, 28, 30, 34], "7f": 16, "7m": 34, "7x": 34, "8": [14, 16, 30, 31, 32, 33], "80": [5, 30, 31], "81": [30, 31], "8180": 32, "8180m": [14, 33], "81x": 30, "82": 31, "822": 34, "83": [31, 33], "8375c": 32, "8380": 30, "8380h": 30, "83x": 30, "84": [6, 30, 31, 33], "85": [30, 31], "85x": 30, "86": [30, 31], "87": 31, "88": 31, "8b": 28, "8x": 18, "8x7b": 28, "9": [6, 7, 14, 17, 23, 25, 31, 32], "9000": 32, "9000000000": [31, 33], "9001": 32, "9002": 32, "9003": 32, "90ghz": 30, "92": 30, "93": 30, "96": 30, "96x": 30, "97": 30, "975": 32, "98": 30, "981": 32, "982": 32, "99": [16, 30, 34], "992": 34, "A": [2, 5, 6, 7, 10, 11, 17, 26, 28, 31, 33, 34], "And": [15, 20, 32, 34], "As": [10, 19, 20, 28, 31, 32, 33, 34], "At": [7, 17], "But": [17, 18], "By": [17, 31, 33], "For": [1, 2, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19, 20, 21, 23, 24, 25, 26, 28, 29, 31, 32, 33, 34], "If": [2, 5, 6, 7, 8, 9, 10, 13, 14, 15, 16, 17, 20, 26, 31, 32, 33, 34], "In": [1, 2, 6, 7, 8, 12, 16, 17, 18, 19, 21, 23, 28, 31, 32, 33, 34], "It": [2, 6, 7, 8, 10, 13, 17, 18, 20, 21, 23, 26, 29, 31, 33, 34], "Its": 28, "NOT": [18, 31], "No": [2, 18, 34], "Not": 2, "ON": 30, "On": [1, 2, 7, 18, 28, 33], "One": [2, 3, 18, 19, 31, 33], "Such": 17, "The": [0, 1, 2, 5, 6, 7, 8, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 28, 29, 30, 31, 32, 33, 34], "Then": 32, "There": [14, 16, 20, 33, 34], "These": [1, 5, 6, 7, 8, 13, 28], "To": [2, 5, 6, 7, 10, 13, 15, 16, 17, 18, 20, 21, 23, 28, 32, 33, 34], "Will": [6, 18], "With": [1, 2, 7, 10, 20, 31, 34], "_": [13, 15, 16, 17, 18, 20, 30, 31, 32, 33, 34], "___": 13, "_____": 13, "__init__": [5, 6, 8, 
10, 16, 20, 26, 34], "__m256i": 17, "__m512": 17, "__m512i": 17, "__main__": [26, 31, 32, 34], "__name__": [26, 34], "_appli": 18, "_build": 5, "_c": [17, 26], "_cmp_ord_q": 17, "_core": 31, "_cvt_fp32_to_bf16": 17, "_get_current_isa_level": 17, "_get_highest_binary_support_isa_level": 17, "_get_highest_cpu_support_isa_level": 17, "_jit_set_texpr_fuser_en": 26, "_lu_with_info": 8, "_mm256_mask_storeu_epi16": 17, "_mm256_storeu_si256": 17, "_mm512_add_epi32": 17, "_mm512_and_si512": 17, "_mm512_castps_si512": 17, "_mm512_cmp_ps_mask": 17, "_mm512_cvtneps_pbh": 17, "_mm512_cvtusepi32_epi16": 17, "_mm512_loadu_p": 17, "_mm512_mask_blend_epi32": 17, "_mm512_maskz_loadu_p": 17, "_mm512_set1_epi32": 17, "_mm512_srli_epi32": 17, "_native_multi_head_attent": 8, "_reorder_cach": 2, "_sym": 2, "_timestamp_inst": 31, "_timestamp_instance_": 31, "ab": [13, 32], "abi": [6, 17, 34], "abil": 16, "abl": 15, "abnorm": [26, 34], "about": [1, 2, 5, 7, 13, 16, 32, 33, 34], "abov": [2, 5, 10, 19, 28, 30, 31, 32], "absolut": [2, 31], "abstract": [2, 11, 20], "acceler": [1, 2, 3, 6, 7, 13, 28, 29, 30, 34], "accept": [2, 34], "access": [2, 6, 7, 18, 19, 32, 34], "accommod": 18, "accompani": 34, "accord": [2, 13, 28, 33, 34], "accordingli": 16, "account": 6, "accu": 16, "accumul": 2, "accur": 8, "accuraci": [2, 3, 6, 7, 8, 15, 16, 21, 22, 26, 28, 34], "accuracy_criterion": [2, 4, 16, 34], "accuracy_criterion_typ": 2, "accuracy_criterion_valu": 2, "achang": 15, "achiev": [1, 2, 6, 7, 28, 33, 34], "across": 16, "act": 34, "act_ic_observ": 2, "act_observ": 2, "act_quant_mod": 2, "action": [6, 23], "activ": [2, 6, 7, 15, 16, 20, 28, 31, 33], "actual": [18, 21], "acycl": 13, "ad": [2, 7, 10, 33, 34], "adagrad": [19, 21], "adagrad_fused_step": 19, "adagrad_step": 19, "adam": 34, "adapt": 7, "adaptive_avg_pool3d": 8, "adaptive_max_pool3d": 8, "adaptiveaveragepoolingkrnl": 17, "add": [2, 5, 7, 8, 13, 14, 19, 21, 32, 34], "add_": 19, "add_argu": [6, 23], "add_casual_mask": 2, "add_execut": 6, 
"add_help": [6, 23], "addbmm": 8, "addcdiv_": 19, "addcmul_": 19, "addit": [2, 6, 7, 17, 21, 28, 34], "addition": 32, "addlayernorm": 34, "addmm": 8, "addmm_": 8, "addr": 31, "address": [7, 18, 31, 32, 33, 34], "addtion": 17, "adjust": 16, "adopt": [28, 34], "advanc": [1, 2, 6, 7, 16, 25, 28], "advantag": [1, 2, 7, 9, 12, 18, 21, 25, 30, 31, 33], "aes_ni": 17, "affect": [2, 31], "affin": [7, 10, 15, 20, 31, 32, 33], "affinit": 32, "after": [2, 5, 7, 13, 20, 21, 23, 24, 32, 33, 34], "afterward": [31, 33], "ag": 7, "again": [5, 19, 32], "against": 6, "agre": 5, "ahead": 5, "ai": [1, 2, 3, 7, 28], "aim": [7, 10, 16, 33], "aka": [7, 18], "albert": 34, "algorithm": [2, 13, 18, 30, 34], "alia": 2, "alibi": 2, "alibi_slop": 2, "align": [17, 18, 21, 34], "aliv": 32, "all": [2, 5, 6, 8, 13, 14, 17, 19, 20, 28, 29, 32, 33, 34], "all_logical_cor": 14, "all_physical_cor": 14, "allcat": 2, "allenai": 26, "alloc": [2, 10, 20, 28, 30, 32, 34], "allow": [2, 8, 14, 16, 22, 33, 34], "allreduc": 2, "almost": 18, "along": [2, 5, 6, 21, 33, 34], "alpha": [2, 6, 19, 22], "alpha_max": [16, 22], "alpha_min": [16, 22], "alpha_step": [16, 22], "alphafold2": 34, "alreadi": [1, 5, 6, 18, 28, 33], "also": [1, 2, 6, 7, 10, 13, 14, 16, 18, 19, 28, 30, 31, 33, 34], "altern": [2, 6, 18], "although": [2, 33], "alwai": [5, 6, 7, 8, 18, 31, 33, 34], "amazon": 32, "among": [2, 31, 32, 33], "amount": [2, 16, 26, 28, 33], "amp": [4, 6, 10, 23, 26, 34], "amp_dtyp": [6, 23], "amp_en": [6, 23], "ampconf": 34, "amplifi": 1, "amx": [1, 3, 6, 7, 17, 25, 28, 30], "amx_bf16": 17, "amx_int8": 17, "amx_til": 17, "an": [1, 2, 5, 6, 7, 8, 10, 11, 13, 14, 16, 17, 18, 19, 20, 21, 26, 31, 32, 33, 34], "anaconda": 17, "analysi": 33, "ani": [2, 5, 8, 10, 17, 18, 32, 34], "announc": 34, "anonym": 17, "anoth": [14, 31, 33, 34], "answer": [18, 30], "anymor": [7, 34], "anyplac": 4, "ao": [2, 6, 15], "apach": [27, 32], "api": [1, 3, 6, 10, 11, 15, 20, 26, 33, 34], "app": [6, 34], "append": [6, 7], "append_torchlib_if_found": 
6, "appli": [2, 6, 7, 8, 12, 13, 16, 18, 19, 21, 23, 26, 28, 29, 31, 34], "applic": [1, 2, 7, 20, 28, 32, 33], "apply_funct": 2, "appropri": 33, "apr": 3, "ar": [1, 2, 3, 5, 6, 7, 8, 10, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 28, 29, 30, 31, 32, 33, 34], "arang": [2, 6, 16], "arbitrari": 2, "arc": 3, "architectur": [2, 28, 30, 33], "area": [7, 14], "aren": 5, "arg": [2, 4, 6, 7, 14, 16, 19, 23, 31, 32, 34], "argc": 6, "argmax": 16, "argpars": [6, 23], "argument": [2, 6, 7, 22, 26, 31], "argumentpars": [6, 23], "argv": 6, "around": 31, "arrai": 18, "articl": [30, 33], "arxiv": 2, "ask": 5, "assign": [18, 31, 32, 33], "assum": [2, 7, 8, 23, 32, 33, 34], "asu": 33, "asymmetr": 2, "async": [20, 34], "asynchron": [2, 7], "aten": [2, 6, 7, 34], "aten_cpu_cap": 17, "attach": 33, "attent": [1, 2, 7, 28, 34], "attention_mask": [2, 6], "attention_mask_pad": 6, "attn_output": 2, "attn_weight": 2, "attribut": 18, "aug": [3, 30], "auto": [2, 6, 10, 17, 18, 22, 23, 26, 28, 31, 33, 34], "auto_alpha_arg": 16, "auto_ipex": 34, "auto_kernel_select": [2, 7, 30], "autocast": [4, 6, 7, 10, 23, 34], "autoclass": 5, "autoconfig": [6, 23], "autofunct": 5, "autom": [4, 7, 8, 14, 31, 32, 34], "automat": [1, 2, 6, 7, 9, 10, 12, 13, 15, 16, 18, 22, 28, 31, 32, 33, 34], "automaticlli": 2, "automixprecis": 34, "automodelforcausallm": [6, 23, 29, 34], "autotoken": [6, 23], "autotp": 28, "autotun": [2, 4, 22, 34], "avaiabl": 2, "avail": [1, 2, 6, 7, 11, 17, 20, 22, 23, 29, 31, 33, 34], "avg_pool3d": 8, "avoid": [2, 10, 20, 21, 26, 31, 32, 33, 34], "avx": [1, 6, 17, 25, 28], "avx2": [17, 26, 34], "avx256": 17, "avx2_vnni": 17, "avx512": [7, 17, 18, 32, 34], "avx512_4fmap": 17, "avx512_4vnniw": 17, "avx512_bf16": 17, "avx512_bitalg": 17, "avx512_bw": 17, "avx512_cd": 17, "avx512_core_vnni": 34, "avx512_dq": 17, "avx512_er": 17, "avx512_f": 17, "avx512_fp16": 17, "avx512_ifma": 17, "avx512_pf": 17, "avx512_vbmi": 17, "avx512_vbmi2": 17, "avx512_vl": 17, "avx512_vnni": 17, 
"avx512_vp2intersect": 17, "avx512_vpclmul": 17, "avx512_vpopcntdq": 17, "avx_vnni": 17, "awar": [18, 20, 31, 32], "awq": [2, 34], "b": [7, 8, 16, 28], "back": [6, 12, 17, 18, 21, 26], "backbon": 2, "backend": [1, 2, 3, 6, 7, 12, 13, 16, 17, 23, 26, 28, 31, 33, 34], "background": 33, "background_thread": [31, 33], "backpropag": 16, "backward": [6, 7, 8, 16, 21, 33, 34], "bactchnorm": 34, "baddbmm": 8, "bag": [26, 34], "baichuan": [2, 28, 34], "baichuan2": [28, 34], "bake": 34, "balanc": [7, 16, 22, 33], "bandwidth": 28, "base": [1, 2, 3, 4, 5, 6, 7, 10, 11, 17, 20, 21, 26, 28, 29, 30, 32, 33, 34], "base_dir": 29, "base_text_classif": 30, "baselin": [16, 22, 34], "basic": [2, 4, 16, 21, 33, 34], "batch": [2, 6, 7, 13, 16, 18, 20, 23, 26, 30, 32, 34], "batch_decod": [6, 23], "batch_id": 6, "batch_idx": [6, 13], "batch_siz": [2, 6, 11, 13, 16, 18, 23, 32], "batchnorm": [13, 17, 18, 26, 34], "batchnorm2d": [7, 10, 26, 34], "batchsiz": [2, 20], "beam": [2, 28], "beam_idx": 2, "beam_idx_tmp": 6, "beam_width": 28, "becam": 34, "becaus": [8, 17, 18, 21, 28, 33, 34], "becom": [7, 28, 33], "been": [0, 1, 6, 7, 10, 17, 18, 28, 31, 33, 34], "beeter": 28, "befor": [1, 2, 5, 6, 13, 14, 17, 18, 20, 31, 33, 34], "begin": 5, "beginn": 16, "behavior": [2, 20, 31, 33], "behaviour": 10, "being": [7, 33], "believ": [8, 18], "below": [6, 8, 10, 14, 19, 20, 21, 22, 23, 26, 28, 31, 32, 33, 34], "bench": 32, "benchmark": [6, 26, 30, 31, 34], "benefici": 18, "benefit": [6, 7, 8, 10, 20, 21, 28, 32, 33, 34], "benifit": 2, "bert": [3, 4, 10, 30, 34], "bert_int8_jit": 32, "bert_ipex_int8": 32, "bertmodel": [4, 6, 11, 32], "bertmodelmodel": 4, "besid": [28, 33, 34], "best": [2, 6, 7, 8, 14, 16, 17, 22, 24, 28, 33, 34], "beta": [23, 26], "better": [1, 2, 6, 7, 15, 18, 20, 28, 31, 32, 33, 34], "between": [7, 8, 17, 20, 33, 34], "beyond": 7, "bf16": [2, 3, 7, 17, 19, 21, 23, 26, 28, 30, 34], "bf16_gw": 21, "bf16_w": 21, "bfloat16": [2, 3, 4, 7, 10, 11, 17, 18, 23, 29, 31, 34], "bfp16": 34, "bia": 
[2, 8, 20, 34], "big": [7, 18], "bigcod": 28, "bigscienc": 28, "bin": [5, 6, 17, 31, 32], "binari": [5, 6, 7, 8, 17, 34], "binary_cross_entropi": 8, "binary_cross_entropy_with_logit": 8, "bind": [6, 7, 31, 32, 33, 34], "bio": 30, "bit": [21, 28], "blob": 2, "block": [2, 5, 16, 20, 22, 28, 33, 34], "block_numb": 2, "block_siz": 2, "block_tabl": 2, "blocktim": 31, "blockwis": 16, "blog": [2, 34], "bloom": [2, 28], "bmm": [8, 34], "bmp": 18, "bn": [2, 10, 15, 26, 34], "bn_fold": 2, "bodi": 17, "bool": [2, 14], "boolean": [7, 34], "booltensor": 7, "boost": [3, 6, 7, 9, 21, 30, 31, 33, 34], "both": [1, 2, 6, 7, 16, 18, 19, 21, 28, 29, 31, 32, 33, 34], "bother": 16, "bottl": 19, "bottleneck": [2, 28], "bottom": 21, "bound": [19, 20, 28, 33], "box": [6, 10, 33], "branch": [1, 7, 30], "break": [6, 16, 34], "brew": 5, "brief": [18, 28, 34], "briefli": 33, "bring": [2, 6, 7, 9, 15, 16, 21, 28, 31, 33, 34], "broad": [7, 9, 34], "broader": 34, "brought": [33, 34], "buffer": [2, 28], "bug": [1, 5, 34], "bui": 21, "build": [6, 28, 33, 34], "built": [7, 17, 20, 34], "busi": 33, "c": [1, 7, 8, 16, 17, 20, 26, 28, 31, 32, 33, 34], "c1": 20, "c10": [6, 17], "c620": 33, "cach": [2, 5, 7, 19, 20, 30, 34], "cache_weight_for_large_batch": 2, "caff": 3, "calcul": [1, 2, 8, 16, 21, 22], "cali_dataset": 34, "calib_dataload": [2, 6, 16, 34], "calib_dataset": [6, 29], "calib_evalu": 6, "calib_func": 2, "calib_sampl": 29, "calibr": [2, 13, 22, 26, 29, 30, 32, 34], "calibrated_model": 34, "calibration_data_load": [4, 6, 13], "calibration_data_set": [15, 34], "calibration_model": 29, "calibration_sampl": 6, "call": [2, 6, 8, 13, 17, 18, 21, 26, 32, 33, 34], "caller": [26, 34], "can": [1, 2, 5, 6, 7, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 26, 28, 29, 30, 31, 32, 33, 34], "cannot": [8, 19, 26, 31, 34], "canon": 18, "capabl": [3, 17, 34], "capac": [21, 30], "captur": [4, 34], "card": 18, "care": 32, "carri": 30, "case": [2, 6, 7, 9, 12, 16, 17, 18, 28, 31, 33, 34], "cases": 32, "cast": 
[2, 8, 21, 28], "casual": 26, "cat": [8, 31, 32, 34], "catch": 6, "categor": 7, "categori": [8, 34], "caus": [2, 7, 21, 26, 28, 31, 33, 34], "causal": 2, "cc": [5, 6, 17], "ccl": [6, 31, 34], "cd": [5, 6], "cdist": 8, "center": 34, "cento": 30, "cerr": 6, "certain": [1, 7, 26, 28, 29, 31, 33], "ch_axi": 2, "chain": 21, "chang": [2, 5, 6, 7, 8, 10, 11, 12, 15, 17, 18, 20, 23, 25, 26, 29, 31], "changed_onli": 5, "changelog": 34, "channel": [2, 3, 10, 15, 16, 26, 34], "channels_last": [6, 7, 18, 23, 33, 34], "char": 6, "charact": 5, "chat": 28, "chatglm": [2, 28], "chatglm2": [28, 34], "chatglm3": [28, 34], "cheat": 23, "check": [2, 5, 6, 7, 13, 18, 28, 29, 31, 34], "check_trac": [6, 13, 32], "checkpoint": [2, 6, 29], "checkpoints_json": 29, "chip": 33, "chipset": 33, "choic": [6, 21, 23, 31], "choleski": 8, "cholesky_invers": 8, "cholesky_solv": 8, "choos": [6, 8, 20, 23, 31, 33, 34], "chosen": [8, 14, 17], "chw": 18, "chwn": 18, "ci": 5, "cifar10": [6, 13], "circumst": 8, "clamp": 13, "clang": 5, "class": [2, 5, 6, 7, 8, 10, 16, 20, 26, 34], "classif": [26, 30], "claus": [7, 10, 19], "clean": 5, "clear": 10, "clibrat": 34, "click": 3, "clone": 5, "close": [18, 31, 33], "cloud": 3, "clr": 19, "cmake": [5, 6, 17, 34], "cmake_minimum_requir": 6, "cmakefil": 17, "cmakelint": 5, "cmakelist": 6, "cnn": [7, 18, 26, 30, 33, 34], "co": [2, 34], "coco": 30, "code": [1, 2, 5, 6, 7, 10, 11, 12, 13, 18, 19, 21, 23, 24, 26, 27, 29, 33, 34], "codegen": [2, 28, 34], "codeless": 31, "codellama": 28, "codenam": 34, "collabor": 3, "collate_batch": 6, "collate_fn": 6, "collect": [6, 32, 33, 34], "column": 6, "com": [2, 5, 34], "combin": [2, 12, 14, 28, 31, 34], "come": 33, "comma": 33, "command": [4, 5, 6, 14, 23, 31, 32, 33, 34], "comment": [5, 14, 17, 22, 34], "commit": 5, "common": [17, 21, 28, 31, 33], "commonli": [7, 28, 33, 34], "commun": [6, 28, 31, 32, 33, 34], "communication_backend_nam": 29, "compact": [31, 32, 33], "compar": [1, 2, 7, 13, 18, 21, 26, 28, 30, 31, 33, 34], 
"compat": [17, 21], "compet": 33, "competit": 33, "compil": [1, 5, 6, 23, 26, 33, 34], "complet": [5, 6, 14, 18, 22, 29, 33], "complex": 17, "complexdoubl": 17, "complexfloat": 17, "complic": [26, 31, 33], "complier": 17, "compon": [15, 26, 27, 28], "compos": [6, 13], "comprehens": [1, 34], "compressor": [3, 7, 16, 22, 34], "compris": 18, "compuat": 13, "comput": [2, 6, 7, 13, 15, 16, 18, 20, 21, 28, 30, 31, 32, 33, 34], "concat": [2, 20, 26, 28, 34], "concat_fp32_from_bf16": 21, "concat_linear": 2, "concat_output": 2, "concaten": [2, 21], "concept": [18, 33], "concern": 7, "conclud": [30, 34], "conclus": 18, "concurr": [32, 33], "conda": [5, 33], "conda_prefix": [31, 32], "condit": 27, "conduct": 7, "conf": [4, 13, 14, 31, 34], "conf_fil": [14, 34], "confer": 3, "config": [2, 6, 11, 23, 31, 32], "configur": [2, 4, 6, 7, 14, 15, 16, 17, 31, 32, 34], "confirm": 31, "conflict": [7, 17], "connect": 33, "consecut": 33, "consider": 16, "consist": [16, 28, 33, 34], "const": [6, 17], "constant": 13, "constraint": [2, 34], "construct": [2, 7, 13], "consum": [7, 14], "consumpt": 34, "contain": [2, 5, 6, 13, 17, 26, 31, 32, 33, 34], "containeraliasingtest": 5, "content": [29, 34], "context": [2, 5, 6, 8, 20, 28, 33, 34], "context_len": 2, "contigu": [6, 13, 18, 33, 34], "contiguous_format": [18, 33], "continu": [31, 32, 34], "contribut": [28, 31, 34], "control": [1, 2, 7, 20, 26, 31, 33, 34], "conv": [2, 8, 10, 13, 15, 20, 26, 34], "conv1d": [8, 13], "conv2": 20, "conv2d": [2, 7, 8, 10, 13, 18, 20, 26, 34], "conv3d": [8, 13, 34], "conv_bn": 2, "conv_bn_fold": [2, 26, 34], "conv_tbc": 8, "conv_transpose1d": 8, "conv_transpose2d": 8, "conv_transpose3d": 8, "conveni": [8, 34], "convers": [2, 8, 13, 34], "convert": [1, 2, 4, 6, 7, 8, 9, 10, 13, 16, 17, 18, 20, 23, 26, 32, 34], "convert_model": [4, 13, 15, 16], "converted_model": [4, 6, 26, 34], "convolut": [2, 6, 7, 13, 20, 33, 34], "convolution1d": 34, "convolutuon": 2, "convrelu": 13, "convsumrelu": 13, "convtranspose2d": [2, 
13], "convtranspose3d": 13, "coo": 18, "cooper": [7, 30, 34], "copi": [5, 17, 18], "copyright": [17, 27], "core": [2, 7, 14, 17, 30, 33, 34], "core_id": [2, 20, 31], "correct": [7, 18, 25, 34], "correspond": [20, 31, 34], "cosine_embedding_loss": 8, "cost": [2, 6, 28, 30, 33], "costli": 33, "could": [7, 13, 16, 18, 26, 32, 33, 34], "count": 31, "counterpart": [2, 7, 18, 34], "coupl": [20, 33, 34], "cout": 6, "cover": [13, 18, 31], "cpp": [5, 6, 33], "cppsdk": 34, "cpu": [1, 3, 4, 5, 6, 7, 8, 10, 14, 15, 16, 19, 20, 23, 25, 26, 28, 30, 31, 32, 34], "cpu_capability_avx512": 17, "cpu_capability_avx512_bf16": 17, "cpu_featur": 17, "cpu_feature_main": 17, "cpu_launcher_arg": 32, "cpu_launcher_en": 32, "cpu_pool": [2, 20, 34], "cpu_pool1": 20, "cpu_pool2": 20, "cpuid": 17, "cpuinfo": 17, "cpunodebind": 33, "cpupool": [2, 20, 34], "crash": [31, 33, 34], "creat": [7, 16, 20, 33, 34], "creation": 2, "creator": 34, "credit": 17, "criteria": 16, "criterion": [6, 8, 16, 22], "cross": [32, 33, 34], "cross_entropy_loss": 8, "crossentropyloss": [6, 16], "csrc": 26, "csv": 14, "ctc_loss": 8, "cu": 5, "cu_seqlens_kv": 2, "cu_seqlens_q": 2, "cudnn": 18, "current": [1, 2, 5, 7, 11, 13, 14, 15, 16, 17, 19, 20, 26, 28, 29, 34], "current_posit": 2, "custom": [1, 2, 7, 26, 34], "customized_forward": 10, "cv": 34, "cvt_fp32_to_bf16": 17, "cvt_fp32_to_bf16_kernel_fn": 17, "cvt_fp32_to_bf16_kernel_impl": 17, "cvt_fp32_to_bf16_kernel_stub": 17, "cvtfp32tobf16": 17, "cvtfp32tobf16krnl": 17, "cxx": [6, 17], "cxx11": 34, "cxx_standard": 6, "d": [4, 5, 6, 7, 8, 13, 26, 28, 34], "d8": 33, "d__avx512f__": 17, "d__avx__": 17, "dag": 13, "daili": 34, "data": [2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 23, 26, 31, 32, 34], "data_typ": 18, "databrick": 28, "dataload": [2, 6, 10, 13, 16, 20, 22, 29, 34], "dataset": [6, 13, 16, 29, 30, 33, 34], "dataset_nam": [10, 34], "datatyp": [20, 34], "date": 34, "dcmake_prefix_path": 6, "dcpmm": 30, "dcpu_cap": 17, "dcpu_capability_amx": 17, 
"dcpu_capability_avx2": 17, "dcpu_capability_avx512": 17, "dcpu_capability_avx512_bf16": 17, "dcpu_capability_avx512_fp16": 17, "dcpu_capability_avx512_vnni": 17, "dcpu_capability_default": 17, "ddp": [2, 6], "ddr": 30, "ddr4": 33, "dealloc": 33, "debug": [2, 31], "debug_squad": [10, 34], "dec": 3, "decai": 7, "decid": [2, 15, 20, 28], "decim": 21, "declar": 17, "decltyp": 17, "decod": [2, 28, 30, 34], "deconv3d": 34, "decor": 2, "dedic": [2, 6, 28, 34], "deduct": 31, "deep": [3, 7, 8, 11, 13, 14, 21, 33], "deepcopi": 2, "deepspe": [2, 34], "def": [2, 6, 8, 10, 16, 20, 26, 34], "default": [2, 4, 6, 7, 10, 12, 13, 15, 16, 17, 20, 22, 23, 26, 28, 30, 32, 33, 34], "default_dynamic_qconfig": [15, 32], "default_dynamic_qconfig_map": 6, "default_dynamic_qconfigprepared_model": 4, "default_static_qconfig": [13, 15, 32, 34], "default_static_qconfig_map": 6, "default_static_qconfigprepared_model": 4, "defin": [2, 5, 6, 7, 8, 10, 16, 17, 18, 22, 32], "definit": [17, 21, 34], "deinit": 5, "deliv": [7, 28, 34], "demand": [2, 7], "demonstr": [6, 18, 26, 32], "demostr": 23, "denomin": 2, "denot": 21, "dens": [7, 18], "dep": 34, "depend": [5, 7, 17, 18, 25, 26, 33, 34], "deploi": 34, "deploy": [2, 7, 13, 34], "deployment_mod": [2, 6, 23], "deprec": [3, 26], "dequant": [13, 16], "desc": 18, "describ": [8, 13, 18, 21, 32, 33], "descript": [4, 7, 16, 18, 20, 25, 33, 34], "descriptor": 34, "design": [2, 5, 8, 18, 21, 29, 34], "desir": [16, 31], "destroy_process_group": 6, "destruct": 33, "detail": [2, 5, 6, 7, 8, 9, 11, 13, 17, 18, 24, 25, 26, 28, 30, 32, 33, 34], "detect": [1, 6, 12, 17, 26, 33, 34], "detectron2": 18, "determin": [2, 6, 17, 21, 33], "develop": [1, 3, 6, 28, 30, 33, 34], "devic": [1, 2, 15, 29, 31, 34], "device_nam": [7, 8], "diagram": [18, 33], "dict": [2, 6, 23], "dictionari": 34, "did": [33, 34], "didn": 20, "differ": [1, 2, 7, 15, 16, 17, 18, 20, 28, 31, 32, 33, 34], "difficult": 18, "difficulti": 16, "diffus": [3, 34], "digit": 21, "dim": [2, 6, 18, 23], 
"dimens": [2, 18, 26], "dinner": [6, 23], "dir": [17, 31], "direct": [2, 5, 13], "directli": [2, 6, 33, 34], "directori": [1, 5, 6, 14, 29, 31, 32], "dirty_decay_m": [31, 33], "disabl": [2, 6, 7, 13, 26, 31, 33, 34], "disable_auto_channels_last": 9, "disable_iomp": [14, 32], "disable_numactl": [14, 32], "disadvantag": 21, "discret": 1, "discrete gpu": 1, "discuss": [5, 18, 33], "dispatch": [1, 34], "dist": 6, "dist_sampl": 6, "distilbert": 30, "distribut": [2, 3, 7, 16, 31, 32, 33, 34], "distributeddataparallel": [6, 34], "distributedsampl": 6, "div": 13, "divid": [2, 13, 31, 32, 33, 34], "divis": [2, 20], "divisor": [2, 20], "dl": [3, 7, 34], "dlopen": 20, "dlrm": [3, 7, 26, 30, 34], "dnnl": 30, "dnnl_verbos": 2, "do": [2, 5, 8, 16, 18, 20, 21, 26, 28, 30, 31, 32, 33, 34], "do_ev": [10, 34], "do_sampl": [6, 23], "doc": [1, 2, 5, 11, 29, 34], "doc_strid": [10, 34], "docker": [30, 34], "dockerfil": 34, "dockerhub": 34, "docstr": 5, "document": [0, 7, 17, 20, 29, 34], "doe": [2, 7, 13, 18, 20, 26, 34], "doesn": [2, 15, 16, 18, 26, 34], "dolli": [28, 34], "domin": [1, 7, 28], "don": [2, 5, 8, 14, 17, 34], "done": [6, 10, 16, 17, 26, 33, 34], "dot": [2, 7, 18, 28], "doubl": 17, "down": [5, 32, 34], "download": [6, 13, 16], "downstream": 8, "dpc": 1, "dpcpp": 34, "dram": 2, "dramat": [32, 33], "drawback": [2, 21], "drive": [1, 7, 28], "driven": 2, "drop": [31, 32], "dropout": [2, 10], "dst": 17, "dtype": [2, 4, 6, 7, 8, 10, 11, 13, 15, 16, 17, 23, 26, 29, 31, 34], "due": [1, 8, 10, 17, 20, 26], "dummi": 32, "dummy_tensor": 32, "dummymodul": 10, "dump": [2, 31], "durat": [2, 21], "dure": [4, 6, 7, 10, 13, 16, 21, 31, 33, 34], "dynam": [1, 4, 20, 28, 32, 33, 34], "dynamic_qconfig": 15, "dynamic_quantized_model": 6, "e": [1, 2, 6, 7, 8, 12, 16, 17, 18, 28, 31, 33, 34], "each": [2, 8, 14, 16, 17, 19, 20, 21, 31, 32, 33, 34], "eager": [1, 7, 12, 23, 32, 34], "earli": [2, 34], "earlier": 21, "eas": [7, 18, 34], "easi": [1, 3, 21], "easier": [2, 18, 21], "easili": [10, 15], 
"ec2": 32, "edit": [5, 26, 34], "effect": [2, 17, 21, 26, 32, 33], "effici": [1, 7, 11, 19, 20, 28, 31, 33, 34], "effort": 34, "eig": 8, "einsum": 34, "either": [2, 26, 31], "el8_4": 30, "elaps": 33, "element": [2, 18, 19], "eleutherai": [2, 28], "elif": 6, "elimin": 28, "els": [6, 14, 17, 18, 23], "elser": 34, "eltwis": 34, "elu": 13, "emb": 7, "emb1": 7, "emb2": 7, "emb3": 7, "emb_m": 7, "embed": [2, 7, 28, 34], "embedding_bag": 10, "embedding_spec": 7, "embeddingbad": 34, "embeddingbag": [7, 26, 34], "embeddingspec": 7, "embedingbag": 7, "emblist": 7, "emerg": [1, 7, 28], "emphas": 33, "emply_lik": 2, "empow": 3, "empti": [18, 31], "enabl": [1, 2, 3, 4, 6, 7, 8, 10, 13, 16, 18, 20, 22, 23, 26, 28, 31, 32, 33, 34], "enable_auto_channels_last": 9, "enable_auto_mix_precis": 34, "enable_auto_mixed_precis": 34, "enable_auto_optim": 34, "enable_blockwise_loss": [16, 22], "enable_jemalloc": 32, "enable_onednn_fus": [2, 13], "enable_tcmalloc": 32, "encod": 34, "encount": [26, 34], "encourag": 34, "end": [6, 13, 20, 34], "endif": 17, "endl": 6, "engin": [1, 6, 18, 33], "enhanc": [1, 3, 28, 34], "enough": [2, 7, 19], "ensur": [11, 19, 20, 32], "entir": [2, 16, 28], "enumer": [6, 13, 16, 29], "env": [6, 29], "env_key1": 5, "env_key2": 5, "env_val1": 5, "env_val2": 5, "environ": [2, 5, 6, 17, 20, 24, 28, 30, 31, 32, 33], "ep": [2, 7, 10, 19], "epoch": 16, "equal": [2, 15, 20, 31, 32, 33], "equip": 33, "equival": 34, "error": [2, 5, 6, 7, 10, 16, 18, 21, 22, 26, 34], "especi": [2, 5, 28, 34], "etc": [2, 5, 6, 17, 34], "eval": [2, 4, 6, 8, 10, 11, 12, 13, 15, 16, 20, 23, 26, 29, 32, 34], "eval_func": [2, 16, 34], "eval_funct": 4, "evalu": [2, 16, 34], "even": [2, 5, 7, 33, 34], "evenli": 31, "everi": [2, 28], "exact": 2, "exactli": 21, "exampl": [2, 5, 7, 8, 13, 18, 19, 21, 22, 23, 24, 25, 28, 29, 32, 33, 34], "example_input": [2, 4, 6, 13, 15, 29, 32, 34], "example_kwarg_input": 2, "examplenet": 20, "examplenet1": 20, "examplenet2": 20, "exce": [26, 30, 33, 34], "except": 
[28, 31], "excess": 34, "excit": 34, "exclus": 31, "execut": [2, 4, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 19, 20, 26, 31, 32, 33, 34], "exetens": 2, "exhibit": 30, "exist": [1, 5, 7, 13, 26, 31, 33], "exit": [6, 31], "exp": 13, "expect": [2, 7, 30, 34], "expecttest": 5, "expens": 18, "experi": [5, 7, 10, 12, 16, 18, 26, 33, 34], "experiment": 34, "explain": [17, 18, 21], "explicit": [18, 20, 33], "explicitli": [2, 8, 16, 20, 26, 31, 34], "explor": 2, "expon": 21, "export": [4, 31, 33], "expos": 8, "express": [18, 34], "ext": [6, 34], "extend": [1, 18, 25, 33, 34], "extens": [2, 3, 4, 6, 9, 10, 13, 14, 16, 17, 23, 24, 25, 27, 28, 29, 30, 31, 33, 34], "extra": [2, 5, 10, 20, 31, 32], "extra_rope_config": 2, "extrem": [7, 14, 33], "f": [5, 6, 13, 16, 28, 34], "f1": 30, "f16c": 17, "f32": [17, 18], "f401": [6, 11, 12, 13, 16, 23, 29], "face": 3, "facebook": [3, 6, 28], "facilit": 34, "fact": [18, 33], "factor": [2, 6, 16, 31], "fail": [10, 26, 34], "failur": [12, 34], "fake": 2, "fake_quantize_per_tensor_affin": 8, "falcon": [2, 28, 34], "fall": [6, 12], "fals": [2, 4, 6, 7, 8, 13, 14, 15, 16, 17, 20, 22, 23, 26, 31, 32, 34], "famili": [2, 28, 33], "fashionmnist": 16, "fast": [4, 12, 33, 34], "fast_bert": [2, 4, 6, 7, 11, 34], "fast_layer_norm": [2, 34], "faster": [2, 6, 7, 8, 30, 33], "fastest": 17, "fastlayernorm": [2, 34], "fatal_error": 6, "favorit": 31, "fb": 34, "feasibl": 10, "featur": [0, 1, 2, 3, 5, 8, 10, 13, 14, 18, 20, 23, 25, 26, 28, 30, 31, 32, 33, 34], "feb": 3, "feed": [2, 9, 18], "feedback": 34, "feedforward": 28, "feel": [5, 18, 34], "few": [5, 7, 9, 13, 16, 18, 32, 34], "fewer": 21, "fft_fft": 8, "fft_fft2": 8, "fft_fftn": 8, "fft_hfft": 8, "fft_ifft": 8, "fft_ifft2": 8, "fft_ifftn": 8, "fft_ihfft": 8, "fft_irfft": 8, "fft_irfft2": 8, "fft_irfftn": 8, "fft_rfft": 8, "fft_rfft2": 8, "fft_rfftn": 8, "figur": [1, 2, 21, 28, 33], "file": [2, 4, 5, 6, 8, 14, 15, 16, 17, 18, 31, 34], "filenam": 5, "find": [1, 2, 7, 14, 16, 23, 26, 30, 31, 34], 
"find_packag": 6, "findavx": 17, "fine": [3, 20, 29, 31, 32, 33, 34], "finer": [1, 7, 20], "finish": [6, 11, 12, 13, 16, 20], "first": [2, 3, 5, 6, 7, 9, 10, 12, 16, 19, 20, 21, 26, 31, 32, 33], "firstli": [2, 28], "fit": [5, 7, 33, 34], "fix": [2, 5, 7, 34], "flag": [2, 5, 7, 17, 20, 31, 34], "flake8": 5, "flan": 28, "flash": 34, "flash_atten_varlen": 2, "flatten": [16, 20], "flexibl": 34, "float": [2, 6, 7, 8, 14, 15, 16, 17, 21, 29, 34], "float16": [2, 8], "float32": [2, 13, 21, 23, 26, 30, 31, 34], "float64": 8, "flourish": 28, "flow": 26, "flush": [6, 23], "fma": 17, "fn_type": 17, "focu": [2, 10, 18, 29, 34], "focus": [13, 34], "fold": [2, 10, 15, 16, 26, 34], "folder": 5, "follow": [1, 2, 4, 5, 6, 7, 8, 11, 14, 15, 16, 17, 18, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 34], "footbal": 7, "footprint": [7, 21, 28, 34], "forg": 33, "fork": [17, 33], "format": [2, 5, 6, 7, 9, 14, 22, 26, 28, 31, 33, 34], "format_tag": 18, "former": 6, "formerli": [30, 33, 34], "formula": 21, "forward": [2, 6, 8, 13, 16, 20, 21, 26, 32, 33, 34], "found": [1, 6, 7, 14, 16, 18, 29, 31, 32, 33, 34], "foundat": [18, 33], "fp16": [2, 6, 17, 29], "fp32": [2, 4, 16, 17, 19, 21, 23, 28, 34], "fp32_gw": 21, "fp32_w": 21, "fpn": 30, "fraction": 21, "fractional_max_pool2d": 8, "fractional_max_pool3d": 8, "fragment": 33, "framework": [5, 34], "free": [31, 34], "freez": [6, 8, 10, 13, 15, 16, 20, 23, 26, 32, 34], "freezed_model": [26, 34], "frequenc": [2, 30], "frequent": 7, "friendli": [7, 33], "from": [1, 2, 3, 4, 5, 8, 10, 11, 13, 15, 16, 17, 18, 19, 20, 21, 23, 25, 28, 29, 31, 32, 33, 34], "from_embeddingbag_list": 7, "from_pretrain": [4, 6, 11, 23, 29, 32], "front": [13, 34], "frontend": [1, 2, 7, 20, 28, 34], "frozenbatchnorm": 34, "frozenbatchnorm2d": 7, "fsi": 34, "fulfil": 20, "full": [2, 5, 18, 32, 33, 34], "fulli": [5, 15, 17, 21, 31, 33, 34], "function": [2, 5, 6, 7, 8, 10, 11, 12, 14, 15, 17, 20, 21, 23, 26, 28, 29, 31, 33, 34], "further": [1, 2, 5, 6, 7, 18, 20, 28, 33, 
34], "fuse": [2, 7, 13, 16, 19, 28, 34], "fuse_update_step": 2, "fusion": [1, 2, 7, 10, 21, 28, 34], "futur": [7, 28, 34], "futuretensor": 20, "fx": [3, 7, 10, 26, 34], "g": [2, 7, 8, 16, 17, 18, 28, 34], "gain": [1, 7, 26, 28, 34], "game": 7, "gave": 14, "gb": 20, "gcc": 17, "gcp": 3, "gelu": [2, 13, 34], "gemm": [7, 18, 26, 28, 34], "gen": [3, 30, 34], "gen_": 2, "gen_id": [6, 23], "gen_text": [6, 23], "genai": [1, 7, 28], "gender": 7, "gener": [1, 5, 6, 7, 10, 12, 16, 17, 18, 21, 23, 28, 29, 30, 31, 32, 33, 34], "generate_kwarg": [6, 23], "genv": 31, "geomean": 34, "geqrf": 8, "get": [1, 2, 3, 4, 6, 7, 10, 11, 15, 17, 20, 21, 22, 26, 28, 29, 30, 31, 33, 34], "get_acceler": 29, "get_core_list_of_node_id": 2, "get_cpp_typesize_and_vecs": 17, "get_cpp_typesize_and_vecsize_kernel_fn": 17, "get_cpp_typesize_and_vecsize_kernel_impl": 17, "get_cpp_typesize_and_vecsize_kernel_stub": 17, "get_smooth_quant_qconfig_map": [2, 6, 29], "get_weight_only_quant_qconfig_map": [2, 6, 29], "getattr": [6, 23], "getveclength": 17, "getveclengthkrnl": 17, "gif": 31, "gil": 20, "git": [2, 5, 28], "github": [1, 2, 5, 6, 7, 8, 34], "give": [32, 34], "given": [2, 6, 13, 14, 16, 28], "global": [2, 20, 22, 34], "global_past_key_valu": 6, "gnu": [6, 17, 32], "go": [2, 5, 8], "gomp_cpu_affin": 33, "good": [1, 2, 5, 7, 12, 18, 19, 28, 33, 34], "googl": [3, 5, 28], "gperftool": 33, "gpertool": 33, "gpt": [2, 28, 30], "gpt2": 26, "gptbigcod": [2, 28], "gptj": 2, "gptjforcausallm": 2, "gptq": [2, 6, 34], "gpu": [1, 3, 18, 34], "grad": [7, 19], "grad0": 19, "grad1": 19, "grad_i": 19, "grad_n": 19, "gradient": 7, "grain": [1, 3, 7, 20], "granular": [2, 31, 32, 33], "graph": [1, 4, 8, 10, 16, 23, 26, 31, 34], "graph_for": 13, "graph_mod": [2, 4, 7, 12, 34], "graphic": 33, "great": 33, "greater": 2, "greedi": [6, 23], "grid": 14, "grid_sampl": 8, "grokk": 3, "ground": 21, "group": [2, 19, 20, 33], "group_norm": 8, "group_siz": 2, "gru": 15, "grucel": 15, "gt": [4, 14, 28, 33], "gtest_filt": 5, 
"guid": [3, 6, 7, 17, 32, 34], "guidanc": 7, "guidelin": 18, "gw": 21, "h": [5, 6, 7, 16, 18, 26, 31, 32], "ha": [0, 1, 2, 7, 10, 14, 17, 18, 20, 21, 26, 28, 30, 31, 33, 34], "had": [6, 33], "half": [2, 7, 17, 21], "halv": 21, "handl": [6, 18, 33], "handler": 32, "hang": [33, 34], "happen": 7, "hard": [18, 26], "hardsigmoid": 34, "hardswish": [13, 34], "hardtanh": 13, "hardwar": [1, 3, 17, 25, 28, 32, 34], "hav": 17, "have": [1, 2, 5, 6, 7, 9, 14, 17, 18, 20, 21, 23, 26, 27, 28, 30, 31, 32, 33, 34], "head": [2, 34], "head_dim": 2, "head_map": 2, "head_mask": 2, "head_num": 2, "head_siz": 2, "header": 17, "heavi": 7, "heavier": 28, "height": 18, "hello": 5, "help": [2, 5, 6, 17, 23, 28, 31, 33, 34], "helper": 2, "here": [5, 8, 10, 13, 16, 17, 18, 20, 26, 32, 33, 34], "herebi": 16, "hero": 34, "heterogen": 34, "heurist": [2, 20, 34], "hf": [6, 28], "hf_beam_sampl": 34, "hf_beam_search": 34, "hf_greedy_search": 34, "hf_sampl": 34, "hidden": [2, 18, 28], "hidden_s": [2, 6], "hidden_st": 2, "high": [19, 21, 33], "higher": [2, 7, 13, 17, 18, 28], "higher_is_bett": 14, "highli": [7, 23, 28, 33, 34], "hinge_embedding_loss": 8, "hint": [2, 20], "histogram": [30, 34], "histogramobserv": [2, 15], "histori": [2, 14, 28], "hobbi": 7, "hold": [18, 33], "home": [31, 32], "homebrew": 5, "hood": 34, "hook": [10, 16], "hopefulli": 7, "host": [30, 34], "hostfil": 31, "hostnam": 31, "hotspot": 28, "how": [1, 2, 10, 15, 17, 18, 23, 28, 31, 32, 33, 34], "howev": [2, 5, 7, 8, 9, 16, 20, 26, 28, 31, 33, 34], "hp": 14, "hpc": 11, "html": [2, 5, 16], "http": [2, 5, 16, 34], "hub": 28, "huber_loss": 8, "hug": 3, "huge": [7, 14, 33], "hugginfac": 34, "huggingfac": [2, 6, 26, 28, 32, 34], "huggingface_transform": 32, "hurt": 20, "hw": 18, "hwc": 18, "hwio": 18, "hwn": 18, "hydra": 31, "hyper": [2, 30, 33, 34], "hyperparam": 14, "hyperparamet": [4, 7], "hyperparamt": 14, "hyperthread": 32, "hypertun": [4, 34], "hypertune_directori": 14, "hypervisor": 34, "hypothesi": 5, "i": [1, 2, 3, 5, 6, 7, 
8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 21, 22, 23, 26, 27, 28, 29, 30, 32, 33, 34], "i_mpi_pin_domain": 31, "iakv": [2, 28], "ic": 2, "ic_block": 2, "id": [2, 31, 32], "idea": [11, 21, 33], "ideep": [17, 18], "ident": [2, 10, 18], "identif": [6, 17], "identifi": 34, "idx": [2, 28, 31], "ieityuan": 28, "illeg": 34, "illustr": [18, 19, 21, 31, 33], "imag": [8, 13, 18, 33, 34], "image_classifi": 32, "imagenet": [18, 30], "immedi": 7, "immintrin": 17, "impact": [2, 7, 20], "imper": [20, 34], "impl": 17, "implement": [1, 5, 7, 11, 19, 26, 28, 33, 34], "implicit": 18, "implicitli": 6, "import": [1, 2, 4, 5, 6, 7, 10, 11, 12, 13, 15, 16, 17, 18, 20, 21, 23, 25, 26, 28, 29, 32, 33, 34], "impract": [7, 14], "improv": [1, 3, 7, 8, 13, 20, 22, 28, 30, 32, 33], "in1": 7, "in2": 7, "in3": 7, "in_i": 7, "in_m": 7, "inaccur": 21, "inc": [16, 17, 22, 28], "includ": [1, 2, 5, 6, 7, 10, 14, 15, 17, 23, 26, 27, 28, 30, 34], "inclus": 33, "incorrect": [12, 26, 34], "increas": [1, 2, 3, 21, 26, 28, 30, 33, 34], "independ": 31, "index": [2, 5, 18, 28, 33], "index_copi": 8, "index_to_nam": 32, "indic": [2, 6, 18, 28], "indirect": 2, "indirect_access_kv_cache_attent": [2, 34], "indirectaccesskvcacheattent": [2, 34], "individu": [5, 30], "inductor": [7, 34], "inevit": 10, "inf": 14, "infer": [2, 3, 4, 7, 10, 11, 12, 15, 18, 20, 21, 23, 26, 30, 33, 34], "inferenc": 2, "inference2": 30, "inference3": 30, "inference_mod": [6, 23, 29], "influenc": [31, 33], "info": [2, 6, 17, 26, 31, 32, 34], "inform": [1, 2, 6, 7, 14, 17, 18, 28, 31, 32, 33, 34], "ingredi": 18, "init": [2, 5, 15, 34], "init_alpha": [16, 22], "init_distribut": 29, "init_infer": 29, "init_method": 6, "init_process_group": 6, "initi": [2, 20, 32], "inject": 34, "inlin": 17, "inplac": [2, 4, 6, 13, 15, 18, 23, 32], "input": [2, 6, 7, 9, 10, 13, 15, 16, 17, 18, 22, 23, 26, 29, 30, 32, 33, 34], "input1": 10, "input_channel": 2, "input_hint": 20, "input_id": [6, 23], "input_ids_pad": 6, "input_s": [6, 23], "input_split_hint": [2, 
20], "input_tokens_length": [6, 23], "inputpath": 32, "insert": [2, 16], "insid": [2, 5, 20, 31], "inspir": 34, "instal": [4, 5, 6, 23, 25, 26, 28, 33, 34], "instanc": [2, 7, 10, 14, 32, 34], "instance_idx": 31, "instancenorm": 34, "instanti": 6, "instead": [7, 8, 14, 19, 20, 29, 30, 31, 32, 33, 34], "instruct": [1, 2, 5, 6, 7, 8, 17, 21, 23, 24, 25, 28, 30, 33, 34], "int": [2, 6, 7, 14, 17, 23, 26, 29, 31, 34], "int4": [2, 28, 29, 34], "int8": [1, 2, 3, 4, 17, 18, 20, 22, 28, 29, 34], "int8_qconfig": 6, "integ": [28, 31, 33], "integr": [7, 18, 28, 33, 34], "intel": [2, 3, 4, 7, 8, 9, 10, 11, 13, 14, 16, 17, 20, 21, 22, 23, 25, 26, 27, 28, 29, 34], "intel discrete gpu": 1, "intel optim": 1, "intel_extension_for_pytorch": [1, 2, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17, 20, 23, 25, 29, 32, 34], "intel_pytorch_extens": [7, 25, 26, 34], "intel\u00ae extension for pytorch*": 1, "intend": 5, "intent": 5, "interact": [7, 34], "interconnect": 33, "interest": 5, "interfac": [5, 6, 18, 26, 28], "intern": [17, 18, 20, 32], "interpret": 31, "interrupt": 32, "intervent": 8, "intra": 2, "intrins": 17, "introduc": [1, 3, 7, 15, 18, 21, 22, 31, 33, 34], "introduct": [0, 2, 7, 28, 33, 34], "invalid": 33, "invers": 8, "investig": [2, 31], "invoc": [1, 7], "invok": [2, 6, 8, 10, 13, 20, 23, 26, 29, 34], "involv": 21, "io": 28, "iostream": 6, "ip": 31, "ipex": [1, 2, 3, 4, 6, 7, 9, 11, 12, 13, 15, 16, 17, 19, 20, 23, 26, 29, 31, 32, 34], "ipex_declare_dispatch": 17, "ipex_define_dispatch": 17, "ipex_en": 32, "ipex_fus": 2, "ipex_register_dispatch": 17, "ipexconfig": 6, "ipexrun": [4, 10, 31, 34], "is_caus": 2, "is_contigu": 18, "is_cus": 2, "is_dynam": [6, 15], "is_hyperthreading_en": 14, "is_runtime_ext_en": 2, "isa": [1, 34], "isa_codegen": 17, "isa_nam": 17, "isacodegen": 17, "issu": [1, 2, 5, 8, 21, 26, 33], "ital": 32, "item": 16, "iter": [2, 16, 21, 28, 34], "its": [2, 6, 7, 8, 14, 17, 21, 28, 30, 31, 32, 33, 34], "itself": [2, 5, 18], "ivalu": 6, "j": [2, 5, 17, 28, 30], 
"jan": 3, "je": 14, "jemalloc": [30, 32, 34], "jemallocl": 31, "jit": [1, 2, 5, 6, 7, 8, 13, 15, 16, 18, 20, 23, 26, 32, 34], "job": 5, "join": 33, "joint": 34, "joint_net": [26, 34], "json": [2, 6, 15, 16, 32, 34], "jul": 3, "jun": 3, "jupyt": 5, "just": [2, 14, 29, 34], "k": [2, 5], "kcpu": 17, "keep": [5, 12, 18, 21, 28, 32, 33, 34], "kei": [2, 7, 28, 34], "kept": 21, "kernel": [1, 2, 7, 20, 26, 28, 30, 33, 34], "kernel_s": 10, "key_cach": 2, "key_token": 2, "keystrok": 5, "keytensor": 2, "keyword": 2, "kill": 32, "kind": 7, "kineto_librari": 6, "kl_div": 8, "kmp": [31, 33], "kmp_": 20, "kmp_affin": [31, 32, 33], "kmp_blocktim": [31, 32, 33], "knob": [2, 4, 12, 31], "know": 5, "knowledg": 33, "known": [6, 10, 28], "kt": 3, "kv": 2, "kv_cach": [2, 28], "kwarg": [2, 29], "l1318": 2, "l1_loss": 8, "l2": 33, "l23": 2, "l4": 2, "l50": 2, "l76": 2, "label": 8, "lake": [7, 30, 34], "lamb": [19, 21], "land": [7, 34], "landscap": [1, 7, 28], "languag": [1, 2, 23, 24, 25, 26, 29, 34], "lar": 34, "larg": [1, 2, 19, 23, 24, 25, 26, 29, 30, 33, 34], "larger": [2, 20, 30, 31, 33, 34], "last": [3, 10, 21, 26, 34], "last_ind": 6, "latenc": [3, 14, 18, 28, 30, 32, 34], "later": [2, 7, 25, 33], "latest": [1, 2, 25, 28, 30, 34], "launch": [4, 6, 20, 32, 34], "launcher": [7, 13, 31, 33, 34], "law": 7, "layer": [2, 16, 20, 22, 28, 34], "layer_past": 2, "layernorm": [2, 13, 16, 22, 34], "layernorm_modul": 2, "layout": [2, 26, 34], "lazi": 5, "ld": 31, "ld_preload": [20, 31, 32, 33], "ldd": 6, "lead": 28, "leaki": 13, "leaky_relu": 13, "leakyrelu": 34, "learn": [3, 7, 8, 11, 13, 14, 21, 31, 33], "learning_r": [10, 34], "leav": [2, 20, 33], "left": [21, 28, 32], "legal": 34, "legend": 28, "len": [2, 6, 7, 13, 16, 17], "length": [2, 5, 14, 21, 26, 30, 34], "less": [2, 8, 18, 20, 26, 34], "let": [5, 10, 18, 19, 20, 21], "level": [7, 10, 13, 16, 18, 20, 21, 26, 33, 34], "leverag": [1, 7, 11, 28, 32, 34], "lib": [6, 31, 32], "lib64": [31, 32], "libc10": 6, "libdnnl_graph": 6, "libgomp": 
33, "libintel": [6, 34], "libiomp": 33, "libiomp5": [20, 31, 32, 33], "libjemalloc": 31, "libpytorch_path": 6, "librari": [1, 2, 5, 6, 7, 17, 20, 32, 33, 34], "libtcmalloc": [31, 32], "libtorch": [6, 34], "libtorch_cpu": 6, "libxsmm": 2, "licens": 17, "lighter": 8, "like": [1, 2, 3, 5, 6, 7, 8, 14, 18, 19, 21, 26, 28, 31, 33, 34], "limit": [5, 8, 10, 20, 26, 32, 33, 34], "linalg_choleski": 8, "linalg_cholesky_ex": 8, "linalg_cond": 8, "linalg_eig": 8, "linalg_eigh": 8, "linalg_eigv": 8, "linalg_eigvalsh": 8, "linalg_householder_product": 8, "linalg_inv": 8, "linalg_inv_ex": 8, "linalg_lstsq": 8, "linalg_matrix_rank": 8, "linalg_qr": 8, "linalg_solv": 8, "linalg_svd": 8, "linalg_svdv": 8, "linalg_tensorinv": 8, "linalg_tensorsolv": 8, "line": [5, 10, 13, 18, 31, 32, 33], "linear": [2, 6, 7, 8, 13, 15, 16, 18, 26, 33, 34], "linear2silumul": [2, 34], "linear_": 2, "linear_bn": 2, "linear_bn_fold": 2, "linear_m": 2, "linear_m_modul": 2, "linear_modul": 2, "linear_relu_stack": 16, "linear_s_modul": 2, "linearadd": [2, 34], "linearaddadd": [2, 34], "lineargelu": [2, 34], "linearize_indices_and_offset": 7, "linearmul": [2, 34], "linearnewgelu": [2, 34], "linearrelu": [2, 34], "linearsilu": [2, 34], "linearsilumul": [2, 34], "link": [1, 6, 17, 34], "linux": [5, 6, 17, 30, 31, 33], "list": [2, 5, 7, 8, 13, 14, 16, 18, 25, 29, 31, 32, 33, 34], "liuhaotian": 28, "live": 5, "ll": [5, 32, 33], "llama": [2, 3, 6, 28, 34], "llama2": [30, 34], "llama3": 34, "llava": [2, 28], "llm": [1, 16, 22, 24, 25, 34], "load": [1, 2, 6, 7, 13, 15, 16, 17, 23, 29, 32, 34], "load_dataset": 6, "load_qconf_summari": 15, "load_state_dict": [2, 34], "loader": 16, "local": [6, 20, 28, 31, 32, 33], "locat": [5, 17, 34], "log": [4, 6, 13, 31, 32, 34], "logic": [2, 14, 18, 32, 33], "login": 6, "logit": 16, "long": [2, 6, 18, 21, 26, 28, 34], "long_factor": 2, "longer": [26, 30, 34], "longform": 26, "look": [5, 14, 16, 18], "loop": [5, 21, 29], "lose": 21, "loss": [2, 5, 6, 8, 16, 18, 21, 26], "loss_fn": 
16, "lot": [28, 34], "low": [3, 4, 6, 7, 21, 23, 31, 33, 34], "low_cpu_mem_usag": [6, 23], "low_precision_checkpoint": [2, 6, 29], "lower": [2, 8, 17, 21, 28, 34], "lowest": 2, "lowp": [2, 6], "lowp_mod": [2, 6, 29], "lr": [6, 7, 8, 16, 19], "lr_decai": 19, "lsb": 17, "lscpu": 33, "lstm": [2, 10, 15, 34], "lstmcell": 15, "lstsq": 8, "lt": [4, 28, 30], "lu_solv": 8, "m": [4, 14, 20, 26, 31, 32, 33, 34], "m6i": [30, 32], "m7i": 30, "machin": [3, 5, 6, 7, 14, 17, 26, 31, 32, 33, 34], "maco": 5, "macro": 17, "made": [5, 34], "mai": [1, 2, 3, 5, 6, 7, 8, 9, 16, 17, 18, 20, 26, 28, 31, 32, 33, 34], "main": [1, 2, 5, 6, 14, 20, 31, 32], "mainli": [31, 34], "maintain": 8, "major": 16, "make": [2, 5, 6, 7, 14, 15, 17, 21, 23, 28, 32, 33], "make_tupl": 17, "makefil": 5, "malloc": [14, 31, 33], "malloc_conf": [31, 33], "mamx": 17, "man": [7, 33], "manag": [2, 8, 13, 20, 28, 31], "mandatori": 14, "mani": [5, 14, 28, 31, 33, 34], "manipul": 18, "mantissa": 21, "manual": [2, 7, 10, 14, 18, 20, 34], "manual_se": [6, 11], "map": [2, 6, 18, 30], "mar": [3, 32], "margin_ranking_loss": 8, "mask": [2, 7, 17, 26], "mask_valu": 17, "maskrcnn": [33, 34], "maskrnn": 34, "master": [2, 7, 21, 31], "master_addr": 6, "master_port": 6, "match": [2, 8, 17, 31], "math": 7, "matmul": [2, 8, 13, 26, 34], "matrix": [1, 6, 7, 25, 28], "matrix_rank": 8, "matur": 34, "mavx2": 17, "mavx512bf16": 17, "mavx512bw": 17, "mavx512dq": 17, "mavx512f": 17, "mavx512fp16": 17, "mavx512vl": 17, "mavx512vnni": 17, "max": [2, 6, 16, 17, 22, 23, 26, 34], "max_context_len": 2, "max_new_token": [6, 23], "max_num_blocks_per_seq": 2, "max_position_embed": 2, "max_seq": 2, "max_seq_len": 30, "max_seq_length": [10, 34], "max_seqlen_k": 2, "max_seqlen_kv": 2, "max_seqlen_q": 2, "max_trial": 14, "max_unpool2d": 8, "max_unpool3d": 8, "maxim": 14, "maximum": [2, 16, 17], "maxpool": 34, "maxpool2d": 13, "maycontainalia": 5, "md": 18, "me": 18, "mean": [2, 16, 17, 18, 20, 22, 28, 34], "meant": 34, "meanwhil": [12, 33, 34], 
"measur": [30, 34], "mechan": [1, 7, 17, 21, 34], "medium": 28, "meet": [21, 33, 34], "meltdown": 30, "membind": 33, "memori": [2, 6, 7, 8, 9, 10, 13, 19, 20, 21, 26, 28, 30, 32, 34], "memory_format": [6, 7, 18, 23], "mention": [3, 10, 20, 21, 34], "merg": [0, 7, 34], "merged_emb": 7, "merged_input": 7, "mergedembeddingbag": 7, "mergedembeddingbagwith": 7, "mergedembeddingbagwithsgd": 7, "merit": 18, "mermori": 2, "messag": [2, 6, 10, 12, 18, 31], "meta": [6, 18, 28, 29, 34], "metadata_thp": [31, 33], "method": [2, 8, 15, 16, 18, 22, 26, 33, 34], "method1": 10, "method2": 10, "methodologi": [2, 6, 7, 19, 33], "methond": 15, "metric": [2, 16, 30], "mfma": 17, "mha": [2, 34], "mhz": 33, "microarchitectur": 33, "microsoft": [2, 28], "might": [2, 7, 18, 26, 33, 34], "migrat": 7, "millisecond": 33, "min": [2, 16, 22, 26, 34], "mind": [18, 32], "mini": [2, 20, 28, 34], "minim": [7, 14, 17, 33], "minimum": [14, 16, 18], "minmax": 34, "minmaxobserv": [2, 6, 15], "misc": 34, "mish": 13, "miss": 5, "mistral": [2, 28, 34], "mistralai": 28, "mitig": [20, 30], "mix": [2, 6, 13, 23, 26, 28, 34], "mixed_dtyp": 34, "mixtral": [2, 28], "mixtur": [8, 34], "mkdir": 6, "mkl": 34, "mkldnn": 18, "mkldnn_util": 18, "mllama": 2, "mlp": 34, "mm": 8, "mmuzzy_decay_m": 33, "mmx": 17, "mno": 17, "mobilenet": 30, "mode": [1, 2, 5, 7, 10, 12, 18, 20, 23, 26, 32, 34], "model": [1, 2, 3, 4, 8, 9, 10, 11, 12, 14, 16, 23, 24, 25, 26, 29, 30, 33, 34], "model1": 20, "model2": 20, "model_execut": 34, "model_id": [6, 23], "model_log": 32, "model_name_or_path": [10, 29, 34], "model_script": 20, "model_service_work": 32, "model_state_dict": 6, "model_stor": 32, "model_to_be_calibr": 34, "modelfamili": 28, "modeling_llama": 2, "modelurl": 32, "modern": 3, "modifi": [2, 5, 6], "modul": [1, 6, 7, 8, 13, 16, 17, 26, 29, 31, 34], "modular": 2, "modulist": 7, "momentum": [6, 10, 21], "monkei": 10, "more": [1, 2, 5, 6, 7, 8, 10, 11, 13, 16, 17, 19, 20, 21, 23, 26, 28, 32, 33, 34], "moreov": [1, 2, 28], 
"mosaicml": 28, "most": [2, 6, 7, 13, 21, 28, 30, 32, 33, 34], "motherboard": 33, "motiv": [2, 20], "move": [18, 33], "movingaverageminmax": 34, "mp_size": 29, "mpi": 31, "mpiexec": 31, "mpt": [2, 28, 34], "mrpc": 30, "mse_loss": 8, "much": [15, 18, 21, 28, 31, 33], "mul": [2, 13, 16], "multi": [2, 7, 14, 20, 28, 31, 33, 34], "multi_margin_loss": 8, "multi_stream": 2, "multi_stream_input_hint": 34, "multi_stream_model": [20, 34], "multi_stream_output_hint": 34, "multidimension": 18, "multiheadattent": 28, "multilabel_margin_loss": 8, "multilabel_margin_loss_forward": 8, "multipl": [2, 5, 7, 8, 16, 17, 18, 26, 28, 30, 32, 33, 34], "multipli": 2, "multistreammodul": [2, 7, 20, 26, 34], "multistreammodulehint": [2, 20, 34], "multithread": 33, "must": [2, 5, 14, 17, 19], "mutual": 31, "muzzy_decay_m": [31, 33], "my": 18, "mykernel": 17, "mymodel": 34, "mypi": 5, "n": [2, 6, 7, 16, 18, 19, 20, 26, 32, 33, 34], "n1": 18, "n2": 18, "n_iter": 32, "name": [2, 5, 7, 14, 17, 25, 28, 31, 32, 33, 34], "namespac": [8, 17], "nan": [17, 34], "nanquantil": 8, "narg": 6, "narrow": 5, "nativ": [1, 6, 7, 8, 17, 19, 21, 26, 28, 34], "natur": [18, 21, 28], "naver": 3, "nb": 18, "nc": 32, "nchw": [7, 33], "ncore": [10, 31], "ncore_per_inst": [14, 34], "ncores_per_inst": 14, "nd": 18, "necessari": 18, "necessarili": 2, "neck": 19, "need": [2, 5, 6, 7, 10, 13, 14, 16, 17, 18, 19, 20, 21, 23, 26, 29, 31, 32, 33, 34], "need_linearize_indices_and_offset": 7, "neelnanda": 6, "neg": 21, "neglig": 18, "neighbor": 2, "neox": [2, 28], "net": 34, "network": [1, 3, 7, 8, 20, 25, 28, 33], "neural": [1, 3, 7, 16, 22, 25, 28, 33, 34], "neuralnetwork": 16, "new": [3, 5, 12, 16, 17, 18, 20, 23, 26, 29, 33], "new_gelu": 2, "new_layer_past": 2, "newer": [1, 28, 33], "newgeluactiv": 2, "newkernel": 17, "newkernelkrnl": 17, "newli": 34, "newlin": 5, "next": [5, 7, 34], "nf4": [2, 29], "nhwc": [7, 33, 34], "nifti": 33, "ninstanc": [10, 14, 31, 34], "nint": 5, "nll_loss": 8, "nll_loss2d": 8, "nlp": [6, 7, 26, 
30, 34], "nm": [7, 34], "nn": [2, 6, 7, 8, 10, 13, 15, 16, 18, 20, 26, 34], "nnc": 26, "nnode": 31, "no_grad": [4, 6, 10, 11, 12, 13, 15, 16, 20, 23, 26, 29, 32, 34], "node": [2, 20, 30, 32, 33, 34], "node0": 33, "node1": 33, "node_id": [2, 20, 31, 32, 34], "non": [2, 5, 8, 13, 18, 30, 32, 34], "noncontigu": 18, "none": [2, 6, 29, 31], "noqa": [6, 11, 12, 13, 16, 23, 29], "normal": [1, 2, 6, 7, 13, 20, 28, 33, 34], "normalized_shap": 2, "note": [2, 3, 5, 6, 15, 16, 17, 18, 20, 22, 24, 28, 30, 31, 32, 33], "notfound": 6, "noth": 2, "notic": [27, 31, 32], "nov": 3, "now": [2, 7, 15, 18, 32, 33, 34], "np": [16, 31], "nproc": 31, "nth": [32, 33], "num": [2, 20, 32, 33, 34], "num_attention_head": 6, "num_beam": [6, 23], "num_block": 2, "num_featur": 7, "num_head": 2, "num_hidden_lay": 6, "num_kv_head": 2, "num_nod": 14, "num_seq": 2, "num_stream": [2, 20, 34], "num_token": 2, "num_train_epoch": [10, 34], "numa": [2, 20, 31, 32, 34], "numactl": [20, 31, 32], "number": [1, 2, 5, 6, 7, 14, 16, 19, 20, 21, 26, 32, 34], "numer": [2, 8, 33], "numpi": 16, "o": [6, 17, 23, 30], "o0": [2, 26, 34], "o1": [2, 26, 34], "o3": 17, "object": [2, 6, 7, 14, 17, 20, 33, 34], "observ": [2, 9, 13, 15, 34], "obsev": 15, "obtain": 16, "obviou": 28, "occupi": 26, "occur": 34, "occurr": 28, "off": [7, 8, 21, 28, 30, 34], "offer": [1, 5, 33], "offici": [5, 32, 33, 34], "offlin": 34, "offset": [2, 18, 28], "often": 7, "old": 34, "omp": [20, 26, 31, 32, 33, 34], "omp_num_threa": 26, "omp_num_thread": [20, 26, 31, 32, 34], "omp_proc_bind": [31, 33], "omp_schedul": [31, 33], "omp_set_num_thread": 34, "onboard": [19, 33], "onc": [2, 5, 6, 14, 17, 18, 20, 21, 32, 33], "ondevic": 29, "one": [2, 5, 7, 12, 13, 14, 16, 18, 19, 20, 26, 29, 31, 33, 34], "oneapi": [6, 33], "oneccl": [3, 6, 31, 34], "oneccl_bindings_for_pytorch": 6, "onednn": [2, 3, 13, 17, 26, 28, 34], "onednn_primitive_cache_capac": 33, "onednn_verbos": 4, "ones": [2, 6, 17], "onli": [1, 2, 5, 7, 8, 10, 11, 13, 14, 15, 16, 17, 18, 20, 21, 
26, 28, 31, 32, 34], "onlyquantizationint4": 28, "onlyquantizationint8": 28, "oob": [10, 34], "op": [2, 7, 15, 16, 22, 28, 34], "op_type_dict": 2, "open": [1, 16, 28, 33], "openai": 28, "openmp": [2, 7, 20, 26, 30, 32, 34], "oper": [1, 2, 6, 8, 13, 15, 21, 32, 33, 34], "opportunit": 2, "opt": [2, 6, 17, 28], "optdecoderlay": 16, "optim": [1, 3, 4, 6, 8, 9, 11, 12, 14, 16, 18, 20, 21, 23, 25, 26, 31, 32, 33, 34], "optimize_lstm": 2, "optimize_transform": 34, "optimized_model": [2, 34], "optimized_optim": 2, "optimizer_state_dict": 6, "optimum": 10, "optin": 2, "option": [1, 2, 5, 7, 10, 14, 15, 16, 29, 31, 34], "optyp": 2, "order": [2, 17, 18, 21, 31, 33, 34], "org": [2, 7, 16, 26, 34], "organ": 18, "orgqr": 8, "origin": [2, 6, 7, 12, 13, 15, 17, 20, 29, 34], "original_max_position_embed": 2, "original_model": 2, "ormqr": 8, "other": [2, 6, 7, 8, 14, 17, 18, 19, 23, 28, 31, 33], "other_1": 2, "other_2": 2, "other_arg": 19, "otheriws": 13, "otherwis": [2, 7, 20], "our": [5, 16, 19, 28, 33, 34], "out": [2, 5, 6, 7, 8, 10, 13, 16, 19, 20, 30, 31, 33, 34], "outlier": [7, 16], "outplac": [18, 34], "output": [2, 6, 7, 8, 13, 14, 16, 18, 23, 26, 34], "output_concat_hint": [2, 20], "output_dir": [10, 14, 34], "output_hint": 20, "output_tokens_length": [6, 23], "outsid": 20, "outstand": 5, "over": [5, 7, 8, 9, 16, 18, 30, 31, 34], "overal": 33, "overflow": [26, 34], "overhead": [1, 2, 7, 10, 19, 20, 26, 28, 33, 34], "overlap": 32, "overrid": 15, "overridden": [2, 17], "oversize_threshold": [31, 33], "overview": [7, 25, 34], "overwrit": [2, 31], "own": [2, 6, 15, 28], "owner": 13, "p29": 30, "p90": 30, "pack": [2, 20, 34], "packag": [1, 2, 5, 6, 7, 10, 23, 25, 26, 32, 33, 34], "pad": [8, 10, 20, 34], "pad_max": 6, "pad_val": 6, "padding_mod": 34, "page": [2, 6, 13, 20, 24, 29, 30, 33, 34], "pagedattent": [2, 34], "paper": [2, 34], "parallel": [2, 5, 6, 7, 28, 33, 34], "param": [2, 19, 31], "param_i": 19, "param_n": 19, "paramet": [2, 6, 7, 8, 10, 16, 17, 19, 20, 21, 26, 28, 
29, 30, 31, 33, 34], "parse_arg": [6, 23], "parser": [6, 23], "part": [3, 5, 7, 8, 18, 21, 26, 31, 33, 34], "parti": 34, "partial": 7, "particular": [5, 6, 8, 29, 34], "partit": [13, 33], "pass": [1, 2, 5, 10, 17, 20, 26, 32, 34], "past": 28, "past_key_valu": [2, 6], "past_kv_length": 2, "patch": [10, 34], "path": [2, 6, 7, 14, 18, 20, 23, 31, 33, 34], "pattern": [7, 11, 18, 28, 34], "pdf": 2, "pdropout": 2, "peak": [2, 7, 11, 34], "penal": 33, "pend": 34, "per": [2, 10, 15, 16, 20, 30, 31, 32, 33, 34], "per_batch": 2, "per_batch_ic_block": 2, "per_batch_ic_block_sym": 2, "per_channel_symmetr": [2, 6, 15], "per_device_train_batch_s": [10, 34], "per_ic_block": 2, "per_tensor": 2, "per_tensor_affin": [6, 15, 34], "per_tensor_symmetr": 15, "perchannelminmaxobserv": [2, 6, 15], "perf": [11, 18], "perfect": 28, "perform": [1, 2, 3, 4, 6, 7, 8, 9, 10, 13, 14, 15, 16, 18, 19, 21, 25, 28, 29, 31], "period": 33, "person": 3, "perspect": [2, 13, 18, 21, 28, 31, 33], "pertain": 17, "phase": [2, 20], "phi": [2, 28, 34], "physic": [2, 14, 20, 32, 33], "pick": 5, "piec": [2, 20], "pile": 6, "pin": [2, 20], "pinvers": 8, "pip": [4, 5, 33, 34], "pip3": 34, "place": [2, 8, 28, 33, 34], "placeholderobserv": [6, 15], "placement": 33, "plai": [7, 33], "plan": [5, 7, 10], "platform": [3, 7, 18, 32, 33, 34], "platinum": [14, 30, 32, 33], "pleas": [2, 6, 7, 11, 16, 22, 26, 28, 29, 31, 33, 34], "plu": 33, "pmi_rank": 6, "pmi_siz": [6, 29], "point": [2, 6, 8, 15, 21, 33, 34], "pointer": 17, "poisson_nll_loss": 8, "polar": 8, "polici": 33, "polish": 34, "polymorph": 17, "pool": [2, 20, 34], "poor": [26, 34], "popular": [1, 7, 22, 28, 30, 34], "popup": 5, "port": 31, "portabl": 11, "portion": 16, "pos_embd_dim": 2, "posit": [2, 28, 33, 34], "position_id": [2, 6], "position_ids_pad": 6, "possibl": [2, 14, 15, 19, 28, 31, 33, 34], "post": [2, 4, 5, 7, 15, 28, 34], "potenti": [3, 7, 34], "pow": 13, "power": [2, 7, 33, 34], "ppn": 31, "pr": [7, 18, 34], "practic": [6, 21, 24, 28, 33], "pragma": 
17, "pre": [2, 28, 34], "precis": [2, 4, 6, 13, 21, 23, 26, 30, 34], "pred": 16, "predefin": 2, "predict": 16, "prefer": [1, 7, 8, 15, 24], "prefetchw": 17, "prefetchwt1": 17, "prefil": 2, "prefix": 31, "preload": [2, 31], "prepack": [2, 6, 10, 18, 26, 34], "prepar": [2, 4, 6, 13, 16, 26, 29, 32, 34], "prepared_model": [2, 4, 6, 13, 15, 16, 26, 29, 34], "prerequisit": [5, 6], "present": 32, "pretrain": [6, 32, 34], "pretti": 33, "prevent": 19, "previou": [14, 16, 18, 33, 34], "previous": 32, "primari": 33, "primarili": [8, 34], "primit": [11, 20, 30, 34], "principl": [3, 18], "print": [6, 11, 12, 13, 14, 16, 17, 23, 31], "printf": 5, "prior": [2, 23], "privat": 34, "probabl": 2, "problem": [7, 19, 26, 32, 33], "proc": 31, "procedur": 32, "process": [2, 6, 7, 11, 12, 14, 16, 19, 20, 21, 26, 31, 32, 33], "processor": [3, 7, 19, 21, 28, 30, 33, 34], "proclist": 33, "prod": 8, "produc": [5, 8], "product": [1, 2, 7, 14, 28, 34], "program": [1, 5, 7, 11, 20, 31, 33, 34], "progress": [26, 28, 34], "project": [1, 6], "prompt": [4, 6, 23, 34], "propag": [13, 21, 33], "proper": 34, "properli": 31, "properti": [6, 32], "propos": [5, 7, 11, 16, 18, 21], "prototyp": [4, 13, 20, 26, 34], "provid": [1, 2, 5, 6, 7, 8, 11, 12, 13, 14, 16, 20, 22, 24, 26, 28, 29, 31, 32, 33, 34], "pseudo": [19, 21, 34], "pseudocod": [26, 34], "pt": [6, 13, 14, 15, 23, 32, 34], "pth": 6, "pthread": 20, "ptmalloc": 32, "ptq": 7, "public": 34, "pull": 5, "purlei": 33, "purpos": [17, 31, 32, 33], "push": 34, "push_back": 6, "put": 33, "py": [2, 5, 10, 14, 20, 31, 32, 34], "pyg": 3, "pyi": 5, "pypi": [26, 34], "python": [1, 2, 4, 10, 14, 17, 20, 26, 28, 29, 31, 32, 33, 34], "pytorch": [2, 3, 4, 6, 7, 8, 9, 10, 13, 14, 16, 17, 20, 23, 25, 26, 27, 28, 29, 30, 31, 33, 34], "q": [2, 28], "qa": [10, 34], "qconf_summari": [6, 15, 16, 29], "qconfig": [2, 4, 6, 13, 16, 26, 29, 32, 34], "qconfig_map": 6, "qconfig_summary_fil": [2, 6, 29], "qconfig_summary_file_path": 29, "qconfigmap": 6, "qint8": [2, 6, 15], 
"qkv": 34, "qparam": 15, "qr": 8, "qscheme": [2, 6, 15, 34], "qualiti": 34, "quant": [2, 16], "quant_method": 2, "quant_stat": 15, "quantconf": 34, "quantil": 8, "quantiz": [1, 3, 4, 13, 22, 26, 28, 30, 32, 34], "quantizat": 2, "quantization_config": [2, 6, 29], "quantize_per_tensor": 26, "quantized_model": [13, 15, 34], "queri": [2, 17, 18], "query_roteri": 2, "query_token": 2, "question": [18, 30], "quick": [1, 20, 24, 25], "quick_check": 5, "quickli": 2, "quicklint": 5, "quickstart_tutori": 16, "quint8": [6, 15], "quit": [17, 34], "qwen": [2, 28, 34], "qwen2": [28, 34], "r": [5, 6, 7, 14, 23, 30, 32, 33], "rais": [2, 10], "rand": [6, 8, 12, 13, 20, 26, 34], "randint": [6, 11, 32], "randn": [2, 10, 13, 16, 18, 32, 34], "random": 14, "rang": [1, 6, 7, 15, 16, 19, 21, 26, 31, 32, 34], "rank": [6, 31, 34], "rapid": 3, "rate": 21, "rather": [2, 18], "ratio": [22, 30, 34], "raw": 2, "rc": 34, "rc3": 34, "re": [5, 8, 32, 33, 34], "reach": 34, "read": [7, 19], "readm": 34, "real": [2, 7, 14, 15, 30, 34], "realli": 5, "realtim": 30, "reason": [2, 10, 18, 20, 34], "rebas": [5, 34], "receip": [16, 20], "receipt": 20, "receiv": 21, "recent": [6, 7, 18], "recip": [2, 4, 7, 13, 15, 26, 28, 34], "recognit": 33, "recommend": [1, 5, 6, 7, 9, 10, 15, 16, 20, 23, 30, 31, 33, 34], "record": [14, 32], "recov": 21, "recurs": 5, "reduc": [1, 2, 7, 15, 19, 20, 21, 22, 26, 28, 33, 34], "reduce_rang": 15, "reduct": 34, "refer": [1, 7, 9, 13, 14, 16, 17, 18, 20, 22, 23, 24, 25, 32, 34], "refin": 34, "reflection_pad1d": 8, "reflection_pad2d": 8, "regard": 13, "regardless": [8, 34], "region": [2, 8, 17, 33], "regist": [1, 7, 10, 16, 17, 34], "registr": 7, "regress": [9, 34], "regular": [6, 21], "reinstal": [5, 26], "reinterpret": 18, "reinterpret_cast": 17, "rel": [2, 4, 16, 31, 34], "relat": [2, 6, 13, 17, 31, 33, 34], "releas": [1, 17, 18, 26, 30, 33], "reli": [18, 20], "relu": [2, 7, 13, 16, 18, 26, 34], "relu6": 34, "remain": 32, "remaind": [2, 20], "remark": [26, 30, 33], "remot": 33, 
"remov": [2, 5, 21, 34], "reorder": [2, 18, 28], "reorder_cach": 28, "repeat": [10, 18, 21], "repeatedli": 5, "replac": [2, 5, 7, 10, 26, 34], "replace_dropout_with_ident": 2, "replication_pad1d": 8, "replication_pad2d": 8, "replication_pad3d": 8, "repo": [5, 6, 7], "repo_root": 29, "report": [1, 17], "repres": [5, 7, 21], "represent": 18, "reproduc": 32, "request": [1, 5, 20, 32], "requir": [2, 5, 6, 8, 10, 16, 18, 21, 26, 28, 29, 31, 32, 34], "research": 28, "reserv": 33, "reshape_and_cach": 2, "residu": 31, "resiz": [6, 13], "resnet18": 34, "resnet18_xpu": 34, "resnet34": [30, 34], "resnet3d": 34, "resnet50": [12, 13, 14, 18, 30, 31, 33, 34], "resnet50_weight": [6, 12, 13], "resnext": 30, "resnext101": [18, 34], "resnext3d": 34, "resolv": 34, "resourc": [13, 20, 28, 32, 33], "respect": [14, 16, 30, 31, 34], "respons": 30, "rest": 32, "restart": 32, "result": [1, 2, 6, 10, 12, 14, 16, 18, 20, 21, 30, 31, 32, 33], "retinanet": 34, "retriev": 33, "return": [2, 6, 7, 8, 10, 16, 17, 20, 26, 34], "return_softmax": 2, "return_tensor": [6, 23], "reus": [2, 33], "review": [7, 34], "rf": 5, "rfc": 18, "rh": 17, "right": [7, 21, 23, 28], "risk": 34, "rm": 5, "rms_norm": [2, 34], "rmsnorm": [2, 28, 34], "rmsnorm_modul": 2, "rn50": [13, 34], "rn50_int8_jit": 32, "rn50_ipex_int8": 32, "rnn": 34, "rnncell": 15, "rnnt": [26, 34], "ro": 2, "roberta": [26, 34], "roialign": [7, 34], "role": 33, "root": [6, 13, 16, 17, 28], "rope": [28, 34], "rope_modul": 2, "rotari": [2, 28], "rotary_dim": 2, "rotary_embed": [2, 34], "rotary_half": 2, "rotary_ndim": 2, "rotaryembed": [2, 34], "roughli": 18, "round": [13, 21], "rounding_bia": 17, "row": 7, "rst": 5, "rule": [21, 34], "run": [2, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 26, 30, 31, 32, 33, 34], "run_20210712212258_inst": 31, "run_20210712212258_instance_0_cores_0": 31, "run_20210712214504_inst": 31, "run_20210712214504_instance_0_cores_22": 31, "run_20210712220928_inst": 31, "run_20210712220928_instance_0_cores_0": 31, 
"run_20210712221150_inst": 31, "run_20210712221150_instance_0_cores_0": 31, "run_20210712221150_instance_1_cores_22": 31, "run_20210712221305_inst": 31, "run_20210712221305_instance_0_cores_0": 31, "run_20210712221305_instance_1_cores_11": 31, "run_20210712221305_instance_2_cores_22": 31, "run_20210712221305_instance_3_cores_33": 31, "run_20210712221415_inst": 31, "run_20210712221415_instance_0_cores_0": 31, "run_20210712221415_instance_10_cores_40": 31, "run_20210712221415_instance_1_cores_4": 31, "run_20210712221415_instance_2_cores_8": 31, "run_20210712221415_instance_3_cores_12": 31, "run_20210712221415_instance_4_cores_16": 31, "run_20210712221415_instance_5_cores_20": 31, "run_20210712221415_instance_6_cores_24": 31, "run_20210712221415_instance_7_cores_28": 31, "run_20210712221415_instance_8_cores_32": 31, "run_20210712221415_instance_9_cores_36": 31, "run_20210712221615_inst": 31, "run_20210712221615_instance_0_cores_11": 31, "run_20210712223308_inst": 31, "run_20210712223308_instance_0_cores_0": 31, "run_20210713152500_instance_0_cores_0": 31, "run_20210713153048_instance_0_cores_0": 31, "run_20210713153333_instance_0_cores_0": 31, "run_20210713153659_instance_0_cores_0": 31, "run_20220106130151_instance_0_cores_0": 31, "run_benchmark": [26, 34], "run_qa": [10, 34], "runner": 5, "running_mod": 34, "runtim": [1, 8, 13, 17, 31, 33, 34], "runtimeerror": [26, 34], "s1": 20, "s7": 34, "s8": 34, "sacrif": 8, "sai": 5, "salesforc": 28, "same": [2, 5, 7, 10, 15, 16, 17, 18, 20, 21, 28, 31, 32, 33, 34], "same_model_execution_again": 34, "sampl": [2, 6, 9, 14, 16, 17, 29, 33], "sample_input": [2, 9, 34], "sample_text_captum_input": 32, "sampler": 6, "sampling_s": [2, 4, 16, 34], "sapphir": 3, "satisfi": [15, 26], "satur": 34, "save": [2, 5, 6, 7, 13, 14, 15, 16, 18, 21, 28, 32, 34], "save_qconf_summari": [6, 15, 16, 29], "scalabl": [3, 7, 21, 28, 30, 33, 34], "scalar": 2, "scalartyp": 17, "scalartypetocpptyp": 17, "scale": [2, 3, 6, 15, 28], "scale_attn": 2, 
"scaled_dot_product_attent": 2, "scatter": 31, "scenario": [2, 6, 7, 18, 33, 34], "schedul": [1, 2, 13, 20, 31, 33], "scheme": 32, "scope": [2, 7, 8, 21, 34], "script": [1, 2, 3, 4, 5, 6, 7, 8, 10, 14, 17, 20, 23, 24, 26, 28, 29, 30, 32, 33, 34], "scriptmodul": [2, 13, 20], "sdk": 34, "search": [1, 2, 4, 5, 7, 16, 22, 28, 31], "sec": 30, "second": [2, 10, 28, 32, 33], "secondli": 28, "secret": 18, "section": [1, 6, 7, 8, 14, 20, 23, 24, 25, 28, 29, 32, 33, 34], "secur": 3, "see": [1, 2, 5, 8, 14, 34], "seed": 2, "seen": 28, "select": [2, 5, 7, 13, 24, 34], "self": [2, 6, 8, 10, 16, 20, 26, 34], "selu": 34, "semant": 18, "sens": 21, "sep": [3, 17], "separ": [7, 19, 27, 33], "seq_classification_artifact": 32, "seq_info": 2, "seq_len": [2, 30], "seq_length": [6, 11, 32], "seqlen_k": 2, "seqlen_q": 2, "sequenc": [2, 18, 21, 28, 34], "sequenti": 16, "seri": 33, "serv": [20, 34], "server": [32, 33], "servic": [6, 28, 30, 33], "session": 30, "set": [1, 2, 4, 5, 6, 7, 8, 14, 15, 16, 17, 21, 24, 26, 28, 30, 31, 32, 33, 34], "set_flush_denorm": 33, "set_format": 6, "set_glob": 6, "set_num_thread": [26, 34], "set_properti": 6, "sete": 15, "settensorexprfuseren": 26, "setup": [5, 6, 28, 34], "setup_config": 32, "setup_lint": 5, "sever": [2, 7, 10, 19, 30, 31, 34], "sgd": [2, 6, 7, 8, 16, 19], "sgemm": 34, "sha": 17, "shall": [5, 18, 33], "shape": [2, 6, 7, 16, 20, 23, 30, 33, 34], "shard": 28, "share": [1, 5, 6, 16, 20, 32, 33, 34], "share_weight_observ": 2, "shared_criterion": [16, 22], "sheet": 23, "shift": 21, "ship": 28, "short_factor": 2, "shortcut": 34, "shorten": 5, "shorter": [21, 28], "should": [2, 5, 8, 15, 20, 28, 31, 33], "show": [8, 17, 21, 28, 29, 30, 31, 32, 33, 34], "shown": [1, 6, 18, 28, 31, 32], "shuffl": 6, "shufflenet": 30, "shufflenetv2_x1": 30, "side": [15, 33], "sigmoid": [13, 34], "sign": 21, "signficantli": 32, "signifi": 28, "signific": 21, "significantli": [28, 34], "silu": [2, 13], "similar": [15, 17, 33], "similarli": 32, "simpl": [5, 7, 8, 11, 
18, 33, 34], "simplenet": [8, 34], "simpli": [6, 7, 26, 31], "simplifi": [10, 34], "simultan": 20, "sin": 2, "sinc": [6, 7, 18, 19, 20, 21, 26, 33, 34], "sincer": 34, "singl": [2, 7, 13, 14, 16, 19, 20, 30, 32, 34], "single_query_cached_kv_attent": 2, "site": 32, "situat": [7, 14], "six": 33, "size": [2, 6, 7, 11, 15, 16, 17, 18, 23, 26, 28, 30, 32, 33, 34], "sizeof": 17, "skip": [5, 6, 17, 18], "skip_special_token": [6, 23], "skylak": 15, "sleef": 17, "sleep": 33, "slice": [6, 18], "sliu": 34, "slope": 2, "slot": [2, 30], "slot_map": 2, "slow": 34, "slower": [8, 33, 34], "small": [7, 19, 33, 34], "smaller": [8, 17], "smooth": 7, "smooth_l1_loss": 8, "smoothquant": [2, 6, 7, 16, 22, 28, 34], "smoothquant_arg": [2, 16], "snippet": [10, 29], "so": [2, 5, 6, 7, 8, 15, 17, 18, 20, 30, 31, 32, 33, 34], "sock": 32, "socket": [14, 30, 32, 33, 34], "soft_margin_loss": 8, "softmax": [2, 13, 34], "softmax_scal": 2, "softwar": [3, 27, 34], "sole": 33, "solut": [2, 7, 26, 28, 34], "solv": [7, 19, 33], "some": [2, 5, 7, 8, 13, 16, 17, 18, 20, 26, 28, 31, 32, 33, 34], "someth": 18, "sometim": [31, 33], "sophist": 33, "sourc": [1, 5, 6, 17, 27, 28, 33, 34], "space": [2, 7, 16, 18, 22, 33], "spars": [7, 18, 34], "sparsiti": 2, "spawn": [7, 20], "special": [17, 18, 28], "specif": [1, 2, 5, 6, 7, 12, 18, 20, 26, 28, 31, 33, 34], "specifi": [2, 5, 6, 14, 20, 31, 33, 34], "specifii": 17, "spectr": 30, "speech": [3, 33], "speed": [2, 7, 11, 19, 28, 33, 34], "speedup": [2, 6, 8, 28, 30, 34], "sphinx": 5, "split": [2, 6, 7, 16, 17, 19, 20, 26, 34], "split_bf16_from_fp32": 21, "split_master_weight_for_bf16": 2, "splitsgd": [7, 21], "spontan": 18, "sqrt": [2, 13, 19], "squad": [10, 30, 34], "squar": [13, 28], "squenc": 2, "src": [2, 17], "src_data_ptr": 18, "src_md": 18, "src_mem": 18, "ssd": [30, 34], "sse": 17, "sse2": 17, "sse3": 17, "sse4_1": 17, "sse4_2": 17, "ssse3": 17, "stabil": [2, 8, 34], "stabilityai": 28, "stabl": [2, 3, 8, 34], "stablelm": [2, 28], "stack": [6, 8], "stage": 
[7, 10, 19, 20, 29, 33, 34], "stakehold": 34, "stall": 33, "standard": [1, 34], "stanford": 34, "starcod": [28, 34], "start": [1, 3, 4, 5, 6, 7, 10, 20, 24, 34], "start_dim": 20, "state": [2, 15, 19, 28], "state_dict": [2, 6, 34], "state_sum": 19, "state_sum_i": 19, "state_sum_n": 19, "statement": [14, 17], "static": [2, 4, 16, 26, 28, 31, 32, 33, 34], "static_quantized_model": 6, "staticquantizationint8": 28, "statist": 7, "statu": 17, "std": [6, 17, 19], "stdio": 5, "stdout": 31, "stead": 17, "steam": [20, 34], "step": [2, 5, 6, 7, 8, 14, 16, 19, 21, 32], "step_siz": [16, 22], "stft": 8, "stick": 7, "still": [2, 5, 7, 8, 13, 16, 18, 21, 26, 34], "stock": [13, 30, 34], "stop": [2, 33], "storag": 19, "store": [2, 17, 18, 19, 21, 28, 31, 32, 33, 34], "store_tru": [6, 23], "str": [2, 6, 14, 23, 31], "straight": [13, 33], "straightforward": 34, "strategi": [14, 31, 33, 34], "stream": [2, 7, 20, 34], "streamlin": 34, "strict": [6, 32], "stride": [8, 10, 20, 34], "stride_c": 18, "stride_h": 18, "stride_n": 18, "stride_w": 18, "string": [2, 31], "structur": [1, 18, 31, 34], "style": [2, 5], "sub": [20, 28, 33], "subfold": 17, "subgraph": 2, "subject": [7, 17, 20, 27, 34], "submit": [1, 5, 7, 20], "submodul": 5, "subsequ": [18, 33], "substr": 5, "success": [10, 24], "suffer": 20, "suffix": 17, "suggest": [1, 2, 15, 18, 20, 33, 34], "suit": 5, "sum": [13, 16, 18, 19, 34], "summar": 26, "summari": [6, 34], "super": [8, 10, 16, 20, 26, 34], "superset": 20, "suppli": 8, "support": [2, 5, 6, 7, 13, 15, 16, 17, 18, 19, 20, 21, 25, 26, 28, 29, 31, 32, 33, 34], "suppos": [2, 6, 14, 33], "sure": [5, 14, 15, 32, 33], "svd": 8, "sw": 30, "swish": 34, "switch": [7, 17, 31, 33, 34], "sy": 30, "sycl": 1, "symbol": 20, "symeig": 8, "symlink": 5, "symmetr": [2, 15], "sync": [5, 20], "synchron": [20, 26, 34], "sysctl": 33, "system": [17, 33], "systemat": 7, "t": [2, 5, 7, 8, 14, 15, 16, 17, 18, 20, 26, 32, 34], "t5": [2, 26, 28, 34], "t_valu": 17, "tab": 5, "tabl": [2, 7, 17, 28, 30, 34], 
"tackl": 7, "tacotron2": 34, "take": [1, 2, 7, 8, 10, 12, 13, 14, 18, 21, 25, 26, 30, 31, 33], "taken": 32, "tanh": [13, 34], "target": [5, 6, 10, 13, 14, 17, 34], "target_link_librari": 6, "target_v": 14, "task": [2, 7, 28, 31, 33, 34], "task1": 20, "task2": 20, "taskset": 31, "tbd": 26, "tc": 14, "tcmalloc": 32, "te": 34, "team": [1, 5], "techniqu": [1, 2, 7, 11, 12, 28, 34], "technolog": [1, 7, 28], "technologi": [3, 7], "tee": 31, "tell": [18, 20, 31, 33], "temperatur": [6, 23], "tenosr": 2, "tensor": [2, 6, 7, 8, 11, 15, 16, 17, 20, 26, 28, 32, 34], "tensorexpr_fus": 26, "tensorflow": 18, "tensoriter": 18, "terabyt": 30, "term": 27, "termin": 14, "test": [7, 16, 17, 30, 34], "test_": 5, "test_alias_analysi": 5, "test_bceloss": 5, "test_data": 16, "test_dataload": 16, "test_jit": 5, "test_mseloss": 5, "test_nn": 5, "test_sequenti": 5, "testclassnam": 5, "testjit": 5, "testnam": 5, "testnn": 5, "testsuit": 5, "text": [3, 6, 26, 28, 30, 33], "text_max_length": 2, "tgi": 34, "than": [2, 5, 7, 17, 18, 20, 21, 26, 31, 33, 34], "thank": [5, 34], "thei": [7, 8, 31, 33], "them": [1, 5, 7, 18, 19, 28, 31, 33], "themselv": [31, 34], "therefor": 33, "thi": [2, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 26, 27, 28, 29, 30, 31, 34], "thing": [14, 33], "third": [19, 34], "those": [2, 15, 33], "though": [2, 7], "thrash": 33, "threa": 34, "thread": [1, 2, 7, 20, 26, 30, 31, 32, 33, 34], "three": [7, 16, 17], "threshold": 33, "through": [1, 2, 6, 7, 8, 12, 25, 28, 33, 34], "throughput": [2, 3, 18, 20, 26, 28, 30, 34], "thu": [2, 7, 8, 10, 18, 20, 21, 28, 31, 32, 33], "thudm": 28, "tidi": 5, "tightli": 34, "tiiuae": 28, "tile": 17, "time": [2, 5, 7, 14, 16, 17, 18, 19, 26, 28, 30, 33, 34], "timeout": [2, 5, 21], "timestamp": [2, 28], "tip": 17, "tmp": [10, 32, 34], "to_bfloat16_train": 7, "to_dens": 18, "to_mkldnn": 18, "togeth": [7, 14, 20, 33, 34], "toggl": 7, "token": [2, 6, 23, 28, 30], "tokenize_funct": 6, "tolist": 16, "tool": [17, 33, 34], "toolset": 17, 
"top": [10, 21, 34], "top1": 30, "toplevel": 5, "topologi": [7, 18, 19, 26, 30, 31, 33, 34], "torch": [1, 2, 4, 6, 8, 10, 11, 12, 13, 15, 16, 18, 20, 23, 26, 29, 32, 33, 34], "torch_ccl": 6, "torch_check": 17, "torch_dtyp": [6, 23], "torch_ipex": [17, 34], "torch_ipex_librari": 6, "torchconfig": 6, "torchdynamo": [1, 7, 12, 23, 34], "torchrun": 34, "torchscirpt": 2, "torchscript": [1, 2, 5, 7, 10, 11, 12, 19, 23, 26, 32, 34], "torchserv": [3, 34], "torchvis": [6, 10, 12, 13, 16, 18, 32, 34], "torchvison": 34, "total": [2, 6, 30, 33], "total_new_token": [6, 23], "totensor": [6, 13, 16], "tpp": 2, "trace": [1, 6, 7, 8, 12, 13, 15, 16, 20, 23, 26, 32, 34], "trace_model": 34, "traced_model": [6, 10, 13, 15, 16, 26, 34], "traced_model1": 20, "traced_model2": 20, "track": 1, "track_running_stat": 10, "trade": [8, 28, 30, 34], "tradeoff": 15, "trail": [5, 21], "train": [2, 3, 4, 7, 11, 13, 15, 16, 18, 21, 23, 26, 28, 31, 34], "train_dataload": 16, "train_dataset": [6, 13], "train_load": [6, 8], "training_data": 16, "transfer": 33, "transform": [2, 3, 4, 6, 10, 11, 13, 16, 18, 22, 23, 28, 29, 32, 33, 34], "transformer_handler_gener": 32, "transformerencoderlay": 26, "transnetv2": 34, "transpar": [2, 7, 29, 33, 34], "transpos": [13, 34], "tree": [5, 6], "tri": 12, "trial": 14, "triangular_solv": 8, "trigger": 12, "triplet_margin_loss": 8, "true": [2, 4, 6, 10, 12, 13, 14, 15, 16, 17, 22, 23, 31, 32, 33, 34], "trust_remote_cod": [6, 23], "truth": 21, "try": [2, 5, 6, 7, 12, 14, 16, 26, 31, 33, 34], "tunabl": [30, 32], "tune": [2, 3, 4, 7, 8, 15, 20, 26, 28, 29, 31, 32, 34], "tuned_conf": 16, "tuned_model": [4, 16, 34], "tunin": 32, "tuning_tim": [2, 4, 16, 34], "tupl": [2, 6, 17, 20], "turboboost": 30, "turn": [7, 34], "tutori": [5, 6, 15, 16, 29, 34], "two": [2, 7, 14, 16, 20, 21, 28, 32, 33, 34], "txt": [5, 6, 32], "type": [2, 4, 5, 6, 7, 10, 16, 17, 18, 20, 21, 23, 30, 31, 32, 34], "types": 17, "typic": [6, 10, 28, 33, 34], "u": [30, 32], "u7": 34, "u8": 34, "ubuntu": 30, 
"ucod": 30, "uint32_t": 17, "ultra": 33, "uma": 33, "unabl": 10, "unalign": [17, 34], "uncas": [4, 6, 10, 11, 32, 34], "undefin": [2, 20, 33], "under": [2, 6, 8, 18, 20, 27, 31, 34], "undergo": 26, "underhood": 34, "underli": [1, 17, 28], "underneath": 34, "understand": [21, 28, 33], "undesir": 31, "unexpect": 2, "unifi": [2, 31], "uniform": 32, "uninstal": 5, "union": 2, "unit": [1, 2, 33], "unittest": 5, "unix": 32, "unlik": 6, "unlist": 8, "unnecessari": 33, "unpack": [26, 34], "unpad": 2, "unpredict": 2, "unrel": 6, "unsign": 34, "unsqueez": 2, "unstabl": 8, "until": [5, 20, 21, 33], "untrack": 5, "unus": [31, 33], "unutil": 32, "up": [2, 3, 7, 11, 20, 24, 28, 33, 34], "updat": [2, 5, 7, 16, 19, 21, 22, 34], "upgrad": 34, "upi": 33, "upload": 34, "upper": [18, 33], "upsampl": [18, 34], "upstream": [7, 18, 34], "url": [32, 34], "us": [1, 2, 3, 4, 5, 6, 11, 14, 15, 17, 18, 19, 21, 23, 24, 25, 26, 27, 28, 32, 33, 34], "usabl": 34, "usag": [2, 6, 7, 8, 23, 25, 32, 33, 34], "use_all_nod": 14, "use_default_alloc": [32, 34], "use_logical_cor": [14, 32], "user": [1, 2, 7, 9, 10, 12, 13, 15, 16, 18, 20, 26, 31, 32, 33, 34], "user_model": [6, 15], "usr": [6, 17, 31, 32], "usual": [2, 18, 20, 33], "usuali": 33, "usus": 32, "ut": 31, "util": [1, 6, 7, 10, 13, 15, 16, 18, 21, 28, 31, 33, 34], "ux": 34, "v": 5, "v0": [28, 34], "v1": [28, 34], "v2": [28, 30, 34], "v3": 34, "valid": [2, 21, 34], "valu": [2, 6, 10, 14, 16, 17, 19, 20, 21, 22, 26, 28, 31, 32, 33, 34], "value_cach": 2, "value_token": 2, "var": 29, "vari": 16, "variabl": [2, 5, 17, 30, 31, 32, 33, 34], "varianc": 34, "variance_epsilon": 2, "variant": [2, 8, 28, 34], "variou": [6, 7, 14, 28, 33, 34], "varlen_attent": [2, 34], "varlenattent": [2, 34], "varlenattention_modul": 2, "ve": 34, "vec256": 17, "vec512": 17, "vec_bia": 17, "vector": [1, 2, 6, 17, 18, 25, 28], "vectors": 17, "verbos": [2, 4, 31], "verbose_off": 2, "verbose_on": 2, "verbose_on_cr": 2, "veri": [2, 5, 15, 18, 28], "verifi": [6, 7], "version": 
[6, 7, 16, 17, 25, 26, 27, 32, 33, 34], "vgg": 30, "vgg11": 30, "via": [2, 5, 6, 7, 18, 20, 30, 31, 33, 34], "video": 7, "view": [13, 18, 20, 21], "view_as_complex": 8, "virtual": 17, "virtual_env": [31, 32], "vision": [3, 6, 30], "visit": [7, 33], "vllm": [2, 34], "vm": 34, "vnni": [1, 15, 17, 25, 28], "vocab_s": [6, 11, 32], "voic": 33, "void": 17, "vstack": 6, "w": [7, 16, 18, 21, 30, 32], "wa": [7, 31, 32, 33, 34], "wai": [5, 10, 16, 18, 28, 34], "wait": [20, 33], "wake": 20, "walk": 34, "want": [2, 5, 7, 14, 15, 17, 20, 31, 34], "warm": 33, "warn": [5, 6, 12, 31, 32, 34], "wav2vec2": 33, "wave2vec": 34, "wc": 18, "we": [1, 2, 5, 6, 7, 8, 9, 10, 14, 15, 16, 17, 18, 19, 20, 21, 23, 28, 30, 32, 33, 34], "web": 28, "webpag": 34, "websit": 7, "wei_ic_observ": 2, "wei_observ": 2, "weight": [1, 2, 7, 10, 12, 13, 15, 16, 18, 20, 22, 23, 26, 28, 34], "weight_dacai": 21, "weight_decai": [7, 19], "weight_dtyp": [2, 6, 29], "weight_qschem": 2, "weights_prepack": [2, 6, 7, 23, 26], "well": [1, 2, 5, 6, 7, 11, 16, 20, 21, 24, 28, 32, 33, 34], "were": [30, 31, 32, 33], "west": 30, "what": [3, 5, 6, 8, 23], "wheel": 34, "when": [2, 5, 6, 7, 8, 9, 14, 18, 19, 20, 21, 22, 25, 26, 28, 30, 31, 32, 33, 34], "where": [2, 5, 7, 16, 21, 33, 34], "wherea": 30, "whether": [2, 6, 8, 16, 18, 22, 23, 33], "which": [1, 2, 5, 7, 8, 10, 14, 15, 16, 17, 18, 20, 26, 28, 30, 31, 32, 33, 34], "while": [2, 7, 8, 11, 12, 18, 21, 26, 28, 31, 32, 33, 34], "whisper": [2, 28, 34], "whl": 34, "who": 10, "whole": [19, 20, 33], "wide": [21, 34], "wider": 1, "widespread": [1, 7, 28], "width": [17, 18], "wikipedia": [13, 33], "wise": [2, 16, 19, 22, 29, 34], "wish": [5, 7], "with_arg": [2, 6, 15], "within": [5, 16, 21, 29, 33, 34], "without": [2, 5, 6, 7, 8, 10, 16, 20, 21, 26, 32, 34], "wlydcrb1": 30, "wn": 18, "won": [2, 7, 8, 17, 26], "woq": [2, 28], "woqactquantmod": 2, "woqlowpmod": [2, 6, 29], "woqweightdtyp": [2, 6, 29], "woqweightqschem": 2, "work": [2, 5, 6, 7, 14, 15, 17, 20, 26, 28, 29, 31, 33, 
34], "workabl": 2, "workaround": [26, 34], "worker": [20, 31], "workflow": 34, "workload": [1, 6, 7, 8, 10, 11, 12, 21, 26, 28, 29, 30, 31, 33, 34], "workload1": 30, "workspac": 6, "world": [5, 7], "world_siz": [6, 29], "worri": 32, "wors": 2, "worth": 34, "would": [2, 5, 6, 14, 16, 17, 18, 30, 31, 32, 33, 34], "wrap": 34, "write": [7, 17], "written": [5, 6, 17], "x": [1, 2, 5, 6, 8, 10, 13, 15, 16, 17, 18, 20, 21, 23, 26, 34], "x1": 20, "x2": 20, "x86": 3, "x86_64": 30, "xcr0": 17, "xdf": 5, "xe": 33, "xeon": [3, 7, 14, 21, 28, 30, 32, 33, 34], "xl": 28, "xlm": 26, "xmx": 1, "xpu": [1, 2, 3, 34], "xsave": 17, "xx": 6, "xx_c": 34, "xx_v": 34, "y": [8, 15, 16, 20, 21, 34], "y1": 20, "y1_futur": 20, "y2": 20, "y2_futur": 20, "y_runtim": 20, "yaml": 14, "ye": 5, "year": 28, "yet": [2, 6, 26, 34], "yield": [1, 7, 33], "yolov3": 34, "you": [1, 2, 5, 6, 7, 8, 13, 14, 15, 17, 18, 20, 23, 25, 26, 28, 29, 31, 33, 34], "your": [1, 5, 6, 7, 8, 10, 14, 15, 20, 23, 24, 26, 27, 28, 29, 34], "your_calibration_dataset": 29, "your_conf_fil": [4, 34], "your_generation_param": 34, "your_python_script": [4, 34], "your_pytorch_script": [4, 31], "yuan": [2, 28], "yuan2": 28, "z11pa": 33, "zero": [6, 15, 34], "zero_grad": [6, 7, 16], "zero_tensor": 2, "zip": [6, 23, 34], "zone": [30, 34], "zoo": [6, 30], "\u03b1": 21}, "titles": ["Intel\u00ae Extension for PyTorch* CPU ISA Dynamic Dispatch Design Doc", "Intel\u00ae Extension for PyTorch*", "API Documentation", "Blogs & Publications", "Cheat Sheet", "Contribution", "Examples", "Features", "Auto Mixed Precision (AMP)", "Auto Channels Last", "Codeless Optimization (Prototype)", "Fast BERT (Prototype)", "Graph Capture (Prototype)", "Graph Optimization", "HyperTune (Prototype)", "Intel\u00ae Extension for PyTorch* optimizations for quantization", "INT8 Recipe Tuning API (Prototype)", "ISA Dynamic Dispatching", "Channels Last", "Optimizer Fusion", "Runtime Extension", "Split SGD", "Smooth Quant Recipe Tuning API (Prototype)", "Quick Start", 
"Installation", "Introduction", "Troubleshooting", "License", "Large Language Models (LLM) Optimization Overview", "LLM Optimizations Frontend API", "Performance", "Launch Script Usage Guide", "TorchServe with Intel\u00ae Extension for PyTorch*", "Performance Tuning Guide", "Releases"], "titleterms": {"": 34, "0": [6, 7, 34], "1": [7, 14, 32, 34], "10": [30, 34], "100": 34, "11": [30, 34], "12": 34, "13": [7, 34], "2": [6, 7, 14, 32, 34], "200": [30, 34], "2xlarg": 30, "3": [32, 34], "300": 34, "4": [32, 34], "8": 34, "9": 34, "That": 18, "The": 10, "__call__": 10, "access": [28, 33], "accuraci": 30, "add": 17, "ai": [6, 30], "algorithm": 16, "all": [18, 31], "alloc": [31, 33], "alpha": [16, 34], "alreadi": 10, "amp": [7, 8], "an": 30, "api": [2, 7, 9, 13, 16, 17, 18, 22, 25, 28, 29], "appli": 10, "architectur": 1, "archiv": 32, "asynchron": 20, "aten": [17, 18], "attr": 10, "auto": [7, 8, 9, 16, 20], "autocast": 8, "autotun": 16, "aw": 30, "b": 18, "basic": 20, "behavior": 8, "benchmark": 32, "bert": [2, 6, 7, 11, 32], "beta": [6, 7], "better": 5, "bf16": [6, 10, 13, 29], "bfloat16": [6, 8, 21, 26, 30], "bind": 20, "block": 18, "blog": 3, "boost": 32, "build": [5, 17], "c": [5, 6, 18], "c6i": 30, "cach": [28, 33], "calibr": [6, 15], "can": 8, "captur": [7, 12], "case": [8, 10, 20], "center": 30, "chang": 34, "channel": [7, 9, 18, 33], "cheat": 4, "check": 17, "code": 17, "codegen": 17, "codeless": [7, 10], "command": 10, "common": 29, "compil": [7, 17], "configur": [20, 30, 33], "content": [32, 33], "contribut": 5, "convers": 18, "convert": 15, "convolut": 18, "core": [20, 31, 32], "correct": 26, "coverag": 18, "cpp": 17, "cpu": [0, 2, 17, 18, 33], "creat": [18, 32], "creation": 18, "csrc": 17, "custom": [17, 28], "d": 18, "data": [28, 30], "debug": [5, 17], "deepspe": [28, 29], "default": [8, 9, 14, 18, 31], "defin": [14, 15], "demo": 28, "denorm": 33, "deploi": [15, 32], "deploy": 6, "descent": 21, "descript": [11, 12], "design": [0, 17, 20, 31], "detail": 20, 
"determin": 16, "develop": 5, "disabl": 9, "dispatch": [0, 7, 17], "dispatchstub": 17, "distribut": [6, 28, 29], "do": 15, "doc": 0, "document": [2, 5, 25, 32, 33], "dure": 20, "dynam": [0, 6, 7, 15, 17, 26], "dyndisp": 17, "eager": [6, 8], "eas": [9, 13], "easi": 7, "ec2": 30, "elig": 8, "enabl": 9, "exampl": [6, 10, 11, 12, 14, 16, 17, 20, 31], "examples1": 20, "examples2": 20, "examples3": 20, "explicitli": 10, "export": 32, "extens": [0, 1, 5, 7, 15, 20, 26, 32], "fast": [2, 6, 7, 11], "featur": [6, 7, 11, 12, 17], "file": 32, "fix": 16, "float32": [6, 8], "fold": 13, "folder": 17, "format": 18, "forward": 10, "fp32": [6, 10, 13, 29, 30], "from": [6, 7], "frontend": 29, "fusion": [13, 19], "gener": [2, 26], "get": 25, "gnu": [31, 33], "gradient": 21, "graph": [2, 7, 12, 13, 28], "guid": [31, 33], "h": 17, "hardwar": [30, 33], "highlight": 34, "how": 20, "huggingfac": 10, "hyperparamet": 14, "hypertun": [7, 14], "i": [18, 20, 31], "ii": 31, "iii": 31, "implement": [17, 20], "improv": 34, "includ": 31, "index": 31, "indirect": 28, "infer": [6, 8, 28, 29, 31, 32], "input": [8, 20], "instal": [24, 32], "instanc": [28, 30, 31], "instead": 10, "int4": 6, "int8": [6, 7, 13, 16, 26, 30, 32], "intel": [0, 1, 5, 6, 15, 30, 31, 32, 33], "intrin": 17, "introduct": [8, 19, 25], "iomp": 20, "ipex": [10, 28], "isa": [0, 7, 17], "issu": [9, 20, 34], "iv": 31, "jemalloc": [31, 33], "jit": 10, "kernel": [17, 18], "known": [9, 20, 34], "kv": 28, "languag": [6, 7, 28], "larg": [6, 7, 28], "last": [7, 9, 18, 33], "latenc": 31, "launch": [10, 31], "launcher": [14, 32], "layout": 18, "level": [2, 17, 28], "librari": 31, "licens": 27, "linear": 28, "lint": 5, "list": 28, "llm": [2, 6, 7, 23, 28, 29, 30], "load": 20, "local": 5, "logic": 31, "low": 28, "manner": 18, "manual": 17, "matter": 18, "memori": [18, 31, 33], "method": 10, "methodologi": [13, 28], "mix": [7, 8], "mode": [6, 28, 31], "model": [6, 7, 13, 15, 18, 20, 28, 32], "modul": [2, 10, 20, 28], "motiv": 10, "multi": 32, 
"multipl": 31, "multistream": 20, "nativ": 18, "nchw": 18, "nchw16c": 18, "new": [6, 7, 34], "nhwc": 18, "node": 31, "non": 33, "note": 34, "numa": 33, "numactl": 33, "number": [30, 31, 33], "omp_num_thread": 33, "omp_thread_limit": 33, "onednn": [18, 33], "onli": [6, 29], "op": 8, "openmp": [31, 33], "oper": [7, 18, 19, 28], "optim": [2, 7, 10, 13, 15, 19, 28, 29], "origin": 10, "other": 34, "output": 20, "overview": [17, 28, 30, 31, 33], "path": 8, "pattern": 13, "perform": [20, 26, 30, 32, 33, 34], "physic": 31, "pin": 32, "precis": [7, 8, 28], "preload": 20, "prepar": 15, "prerequisit": 11, "primit": [18, 33], "privat": 17, "process": 17, "product": 30, "promot": 8, "prototyp": [2, 6, 7, 10, 11, 12, 14, 16, 22, 28], "pseudocod": 29, "public": 3, "pytest": 5, "python": [5, 6, 7], "pytorch": [0, 1, 5, 15, 18, 32], "qconfig": 15, "quant": 22, "quantiz": [2, 6, 7, 15, 16, 29], "quick": 23, "recip": [16, 20, 22], "refer": [6, 8], "regist": [18, 32], "regress": 26, "releas": 34, "requir": [17, 20], "resnet50": [6, 32], "result": [26, 34], "runtim": [2, 7, 20, 26], "scale": 32, "scenario": 29, "script": 31, "search": 14, "select": 17, "serial": 32, "serv": 32, "set": 20, "sgd": 21, "shape": 26, "sheet": 4, "singl": [28, 31], "smooth": [6, 16, 22], "smoothquant": 29, "softwar": [30, 33], "space": 14, "specif": [8, 17], "split": 21, "start": [23, 25, 32], "static": [6, 15], "statu": 18, "stochast": 21, "stride": 18, "struct": 17, "structur": [20, 33], "stub": 17, "support": [1, 8, 10], "target": 18, "task": 20, "tcmalloc": [31, 33], "tensor": 18, "test": 5, "thi": [32, 33], "through": 16, "throughput": 31, "tip": 5, "torch": 7, "torchdynamo": [6, 26], "torchscript": [6, 8], "torchserv": 32, "trace": 10, "train": [6, 8], "troubleshoot": 26, "tune": [14, 16, 22, 33], "type": [8, 28], "uniform": 33, "unit": 5, "us": [7, 8, 9, 10, 13, 16, 20, 31], "usag": [10, 11, 12, 14, 16, 20, 26, 29, 31], "user": 14, "v": 31, "v1": 30, "vec": 17, "verifi": 28, "version": 30, "vi": 31, 
"via": 28, "vii": 31, "viii": 31, "weight": [6, 29], "what": [18, 34], "widest": 8, "wip": 18, "woq": 29, "worker": 32, "write": [5, 18], "xyz": 17, "xyzkrnl": 17, "your": 31, "your_conf_fil": 14, "your_python_script": 14}}) \ No newline at end of file +Search.setIndex({"alltitles": {"$\\alpha$ Usage": [[16, "alpha-usage"]], "1. Creating a serialized file": [[32, "creating-a-serialized-file"]], "1. Defining hyperparameters to tune:": [[14, "defining-hyperparameters-to-tune"]], "1.0.0-Alpha": [[34, "id47"]], "1.0.1-Alpha": [[34, "alpha"]], "1.0.2": [[34, "id46"]], "1.1.0": [[34, "id44"]], "1.10.0": [[34, "id34"]], "1.10.100": [[34, "id33"]], "1.11.0": [[34, "id31"]], "1.11.200": [[34, "id29"]], "1.12.0": [[34, "id26"]], "1.12.100": [[34, "id25"]], "1.12.300": [[34, "id23"]], "1.13.0": [[34, "id20"]], "1.13.100": [[34, "id18"]], "1.2.0": [[34, "id41"]], "1.8.0": [[34, "id39"]], "1.9.0": [[34, "id38"]], "2. Creating a Model Archive": [[32, "creating-a-model-archive"]], "2. Defining the search spaces of the hyperparameters:": [[14, "defining-the-search-spaces-of-the-hyperparameters"]], "2.0.0": [[34, "id16"]], "2.0.100": [[34, "id14"]], "2.1.0": [[34, "id12"]], "2.1.100": [[34, "id10"]], "2.2.0": [[34, "id8"]], "2.3.0": [[34, "id6"]], "2.3.100": [[34, "id4"]], "2.4.0": [[34, "id2"]], "2.5.0": [[34, "id1"]], "3. Start TorchServe to serve the model": [[32, "start-torchserve-to-serve-the-model"]], "4. 
Registering and Deploying model": [[32, "registering-and-deploying-model"]], "": [[14, "your-python-script"]], "API Documentation": [[2, null], [25, "api-documentation"]], "Accuracy": [[30, "accuracy"]], "Add Custom Kernel": [[17, "add-custom-kernel"]], "Algorithm: Auto-tuning of $\\alpha$.": [[16, "algorithm-auto-tuning-of-alpha"]], "Already using Jit Trace": [[10, "already-using-jit-trace"]], "Already using ipex.optimize": [[10, "already-using-ipex-optimize"]], "Architecture": [[1, "architecture"]], "Auto Channels Last": [[7, "auto-channels-last"], [9, null]], "Auto Mixed Precision (AMP)": [[7, "auto-mixed-precision-amp"], [8, null]], "Autocast Op Reference": [[8, "autocast-op-reference"]], "BERT": [[6, "bert"], [6, "id2"], [6, "id4"], [6, "id7"], [6, "id10"], [6, "id13"], [32, "bert"]], "BFloat16": [[6, "bfloat16"], [21, "bfloat16"], [26, "bfloat16"]], "Benchmarking with Launcher": [[32, "benchmarking-with-launcher"]], "Benchmarking with Launcher Core Pinning": [[32, "benchmarking-with-launcher-core-pinning"]], "Better local unit tests with pytest": [[5, "better-local-unit-tests-with-pytest"]], "Blogs & Publications": [[3, null]], "Building documentation": [[5, "building-documentation"]], "C++": [[6, "c"]], "C++ Unit Testing": [[5, "c-unit-testing"]], "CPU Channels Last Targets": [[18, "cpu-channels-last-targets"]], "CPU ISA build compiler requirement": [[17, "cpu-isa-build-compiler-requirement"]], "CPU Runtime": [[2, "module-intel_extension_for_pytorch.cpu.runtime"]], "CPU feature check": [[17, "cpu-feature-check"]], "Calibration": [[6, "calibration"]], "Channels Last": [[18, null], [33, "channels-last"]], "Cheat Sheet": [[4, null]], "Code Folder Struct": [[17, "code-folder-struct"]], "CodeGen Process": [[17, "codegen-process"]], "Codeless Optimization (Prototype)": [[10, null]], "Codeless Optimization (Prototype, NEW feature from 1.13.0)": [[7, "codeless-optimization-prototype-new-feature-from-1-13-0"]], "Command to apply ipex optimization for BF16": [[10, 
"command-to-apply-ipex-optimization-for-bf16"]], "Command to apply ipex optimization for FP32": [[10, "command-to-apply-ipex-optimization-for-fp32"]], "Configuration": [[30, "configuration"], [30, "id2"], [30, "id5"]], "Contents of this Document": [[32, "contents-of-this-document"], [33, "contents-of-this-document"]], "Contributing to Intel\u00ae Extension for PyTorch*": [[5, "contributing-to-intel-extension-for-pytorch"]], "Contribution": [[5, null]], "Convert to Dynamic Quantized Model and Deploy": [[15, "convert-to-dynamic-quantized-model-and-deploy"]], "Convert to Static Quantized Model and Deploy": [[15, "convert-to-static-quantized-model-and-deploy"]], "Creating and Exporting INT8 model for Intel\u00ae Extension for PyTorch*": [[32, "creating-and-exporting-int8-model-for-intel-extension-for-pytorch"]], "Default Precision": [[8, "default-precision"]], "Default memory allocator": [[31, "default-memory-allocator"]], "Default search space": [[14, "default-search-space"]], "Define QConfig": [[15, "id1"]], "Define qconfig": [[15, "define-qconfig"]], "Defining hyperparameters and their search spaces": [[14, "defining-hyperparameters-and-their-search-spaces"]], "Demos": [[28, "demos"]], "Denormal Number": [[33, "denormal-number"]], "Deployment": [[6, "deployment"]], "Design of Task": [[20, "design-of-task"]], "Detail Design": [[20, "detail-design"]], "Determining the alpha through auto-tuning": [[16, "determining-the-alpha-through-auto-tuning"]], "Developing Intel\u00ae Extension for PyTorch*": [[5, "developing-intel-extension-for-pytorch"]], "Dispatch Stub implementation: csrc/cpu/dyndisp/DispatchStub.cpp and csrc/cpu/dyndisp/DispatchStub.h": [[17, "dispatch-stub-implementation-csrc-cpu-dyndisp-dispatchstub-cpp-and-csrc-cpu-dyndisp-dispatchstub-h"]], "Distributed Inference": [[28, "distributed-inference"]], "Distributed Inference with DeepSpeed": [[29, "distributed-inference-with-deepspeed"]], "Distributed Training": [[6, "distributed-training"]], "Dynamic Dispatch 
Design": [[17, "dynamic-dispatch-design"]], "Dynamic Quantization": [[6, "dynamic-quantization"], [15, "dynamic-quantization"]], "Dynamic Shape": [[26, "dynamic-shape"]], "Eager Mode": [[6, "eager-mode"], [6, "id5"]], "Ease-of-use auto channels last API": [[9, "ease-of-use-auto-channels-last-api"]], "Ease-of-use graph optimization API": [[13, "ease-of-use-graph-optimization-api"]], "Easy-to-use Python API": [[7, "easy-to-use-python-api"]], "Example Usage with HuggingFace": [[10, "example-usage-with-huggingface"]], "Example of MultiStream Module": [[20, "example-of-multistream-module"]], "Example of asynchronous task": [[20, "example-of-asynchronous-task"]], "Example of configuring core binding": [[20, "example-of-configuring-core-binding"]], "Example:": [[17, "example"], [17, "id1"]], "Examples": [[6, null]], "Examples1: Basic Usage": [[20, "examples1-basic-usage"]], "Examples2: Usage with \u201cAUTO\u201d setting": [[20, "examples2-usage-with-auto-setting"]], "Examples3: Usage for models with structure inputs/outputs": [[20, "examples3-usage-for-models-with-structure-inputs-outputs"]], "FP32 and BF16 fusion patterns": [[13, "fp32-and-bf16-fusion-patterns"]], "FP32 and BF16 models": [[13, "fp32-and-bf16-models"]], "FP32 and BFloat16 with v1.10": [[30, "fp32-and-bfloat16-with-v1-10"]], "FP32 with v1.11.200 on an AWS EC2 C6i.2xlarge instance": [[30, "fp32-with-v1-11-200-on-an-aws-ec2-c6i-2xlarge-instance"]], "FP32/BF16": [[6, "fp32-bf16"], [29, "fp32-bf16"]], "Fast BERT (Prototype)": [[11, null]], "Fast BERT Optimization (Prototype, NEW feature from 2.0.0)": [[7, "fast-bert-optimization-prototype-new-feature-from-2-0-0"]], "Fast Bert (Prototype)": [[2, "fast-bert-prototype"], [6, "fast-bert-prototype"]], "Feature Description": [[11, "feature-description"], [12, "feature-description"]], "Features": [[7, null]], "Float32": [[6, "float32"]], "Folding": [[13, "folding"]], "Fusion": [[13, "fusion"]], "GNU OpenMP": [[33, "gnu-openmp"]], "GNU OpenMP Library": [[31, 
"gnu-openmp-library"]], "General": [[2, "general"]], "General Usage": [[26, "general-usage"]], "Get Started": [[25, "get-started"]], "Graph Capture (Prototype)": [[12, null]], "Graph Capture (Prototype, NEW feature from 1.13.0)": [[7, "graph-capture-prototype-new-feature-from-1-13-0"]], "Graph Optimization": [[2, "graph-optimization"], [7, "graph-optimization"], [13, null], [28, "graph-optimization"]], "Hardware Configuration": [[30, "hardware-configuration"], [30, "id7"], [33, "hardware-configuration"]], "Highlights": [[34, "highlights"], [34, "id3"], [34, "id5"], [34, "id7"], [34, "id9"], [34, "id11"], [34, "id13"], [34, "id15"], [34, "id17"], [34, "id19"], [34, "id21"], [34, "id24"], [34, "id27"], [34, "id30"], [34, "id32"], [34, "id35"]], "How the core binding is implemented": [[20, "how-the-core-binding-is-implemented"]], "HyperTune (Prototype)": [[14, null]], "HyperTune (Prototype, NEW feature from 1.13.0)": [[7, "hypertune-prototype-new-feature-from-1-13-0"]], "Hyperparameters": [[14, "hyperparameters"]], "I. Use all physical cores": [[31, "i-use-all-physical-cores"]], "II. Use all cores including logical cores": [[31, "ii-use-all-cores-including-logical-cores"]], "III. Use physical cores on designated nodes": [[31, "iii-use-physical-cores-on-designated-nodes"]], "INT8": [[6, "int8"], [26, "int8"]], "INT8 Quantization": [[7, "int8-quantization"]], "INT8 Recipe Tuning API (Prototype)": [[16, null]], "INT8 fusion patterns": [[13, "int8-fusion-patterns"]], "INT8 models": [[13, "int8-models"]], "INT8 with v1.11": [[30, "int8-with-v1-11"]], "IOMP preload or load during the runtime": [[20, "iomp-preload-or-load-during-the-runtime"]], "ISA Dynamic Dispatching": [[7, "isa-dynamic-dispatching"], [17, null]], "ISA intrinics specific kernel example:": [[17, "isa-intrinics-specific-kernel-example"]], "IV. 
Use your designated number of cores": [[31, "iv-use-your-designated-number-of-cores"]], "Indirect Access KV Cache": [[28, "indirect-access-kv-cache"]], "Inference": [[6, "inference"]], "Inference with Eager Path": [[8, "inference-with-eager-path"]], "Inference with TorchScript Path": [[8, "inference-with-torchscript-path"]], "Install Intel\u00ae Extension for PyTorch*": [[32, "install-intel-extension-for-pytorch"]], "Installation": [[24, null]], "Intel CPU Structure": [[33, "intel-cpu-structure"]], "Intel OpenMP": [[33, "intel-openmp"]], "Intel OpenMP Library": [[31, "intel-openmp-library"]], "Intel\u00ae AI Reference Models": [[6, "intel-ai-reference-models"]], "Intel\u00ae Extension for PyTorch*": [[1, null]], "Intel\u00ae Extension for PyTorch* CPU ISA Dynamic Dispatch Design Doc": [[0, null]], "Intel\u00ae Extension for PyTorch* optimizations for quantization": [[15, null]], "Introduction": [[8, "introduction"], [19, "introduction"], [25, null]], "Jemalloc": [[31, "jemalloc"], [33, "jemalloc"]], "Kernel Stub: csrc/cpu/aten/xyz.cpp and csrc/cpu/aten/xyz.h": [[17, "kernel-stub-csrc-cpu-aten-xyz-cpp-and-csrc-cpu-aten-xyz-h"]], "Kernel implementation: csrc/cpu/aten/kernels/xyzKrnl.cpp": [[17, "kernel-implementation-csrc-cpu-aten-kernels-xyzkrnl-cpp"]], "Known Issues": [[34, "known-issues"], [34, "id22"], [34, "id28"], [34, "id36"]], "Known issue": [[9, "known-issue"], [34, "known-issue"], [34, "id49"]], "Known issues": [[20, "known-issues"], [34, "id43"]], "LLM Module Level Optimizations (Prototype)": [[2, "llm-module-level-optimizations-prototype"]], "LLM Optimizations Frontend API": [[29, null]], "LLM Performance": [[30, "llm-performance"]], "LLM Quick Start": [[23, "llm-quick-start"]], "Large Language Model (LLM)": [[6, "large-language-model-llm"]], "Large Language Models (LLM) Optimization Overview": [[28, null]], "Large Language Models (LLM, NEW feature from 2.1.0)": [[7, "large-language-models-llm-new-feature-from-2-1-0"]], "Launch Script Usage Guide": [[31, 
null]], "Launcher Core Pinning to Boost Performance of TorchServe Multi Worker Inference": [[32, "launcher-core-pinning-to-boost-performance-of-torchserve-multi-worker-inference"]], "Launcher Hyperparameters": [[14, "launcher-hyperparameters"]], "License": [[27, null]], "Linear Operator Optimization": [[28, "linear-operator-optimization"]], "Local linting": [[5, "local-linting"]], "Low Precision Data Types": [[28, "low-precision-data-types"]], "Memory Allocator": [[33, "memory-allocator"]], "Memory Format Is All That Matters": [[18, "memory-format-is-all-that-matters"]], "Methodology": [[13, "methodology"]], "Module Level Optimization API for customized LLM (Prototype)": [[28, "module-level-optimization-api-for-customized-llm-prototype"]], "Module uses forward method explicitly instead of the __call__ attr": [[10, "module-uses-forward-method-explicitly-instead-of-the-call-attr"]], "Motivation": [[10, "motivation"]], "Multiple instances for inference": [[31, "multiple-instances-for-inference"]], "NOTE": [[34, "note"]], "Non-Uniform Memory Access (NUMA)": [[33, "non-uniform-memory-access-numa"]], "Numactl": [[33, "numactl"]], "OMP_NUM_THREADS": [[33, "omp-num-threads"]], "OMP_THREAD_LIMIT": [[33, "omp-thread-limit"]], "OneDNN primitive cache": [[33, "onednn-primitive-cache"]], "Op Eligibility": [[8, "op-eligibility"]], "Op-Specific Behavior": [[8, "op-specific-behavior"]], "OpenMP": [[33, "openmp"]], "Operation Fusion": [[19, "operation-fusion"]], "Operator Optimization": [[7, "operator-optimization"]], "Ops that can autocast to bfloat16": [[8, "ops-that-can-autocast-to-bfloat16"]], "Ops that can autocast to float32": [[8, "ops-that-can-autocast-to-float32"]], "Ops that promote to the widest input type": [[8, "ops-that-promote-to-the-widest-input-type"]], "Optimization Methodologies": [[28, "optimization-methodologies"]], "Optimizer Fusion": [[19, null]], "Optimizer Optimization": [[7, "optimizer-optimization"]], "Others": [[34, "others"]], "Overview": [[17, 
"overview"], [30, "overview"], [31, "overview"], [33, "overview"]], "Performance": [[30, null], [34, "performance"]], "Performance Boost with Intel\u00ae Extension for PyTorch* and Launcher": [[32, "performance-boost-with-intel-extension-for-pytorch-and-launcher"]], "Performance Data for Intel\u00ae AI Data Center Products": [[30, "performance-data-for-intel-ai-data-center-products"]], "Performance Improvement": [[34, "performance-improvement"]], "Performance Numbers": [[30, "performance-numbers"], [30, "id1"], [30, "id4"]], "Performance Regression": [[26, "performance-regression"]], "Performance Result": [[34, "performance-result"]], "Performance Tuning Guide": [[33, null]], "Performance recipes": [[20, "performance-recipes"]], "Prepare Model": [[15, "prepare-model"]], "Prepare Model and Do Calibration": [[15, "prepare-model-and-do-calibration"]], "Prerequisite": [[11, "prerequisite"]], "Private Debug APIs": [[17, "private-debug-apis"]], "Pseudocode of Common Usage Scenarios": [[29, "pseudocode-of-common-usage-scenarios"]], "PyTorch Channels Last Memory Format APIs": [[18, "pytorch-channels-last-memory-format-apis"]], "PyTorch Strided Layout": [[18, "pytorch-strided-layout"]], "Python": [[6, "python"]], "Python Unit Testing": [[5, "python-unit-testing"]], "Quantization": [[2, "module-intel_extension_for_pytorch.quantization"]], "Quick Start": [[23, null]], "Releases": [[34, null]], "Requirements": [[20, "requirements"]], "ResNet50": [[32, "resnet50"]], "Resnet50": [[6, "resnet50"], [6, "id1"], [6, "id3"], [6, "id6"], [6, "id9"], [6, "id12"]], "Result Correctness": [[26, "result-correctness"]], "Runtime Extension": [[7, "runtime-extension"], [20, null], [26, "runtime-extension"]], "Scaling workers": [[32, "scaling-workers"]], "Select ISA level manually.": [[17, "select-isa-level-manually"]], "Serving model with Intel\u00ae Extension for PyTorch*": [[32, "serving-model-with-intel-extension-for-pytorch"]], "Single instance for inference": [[31, 
"single-instance-for-inference"]], "Smooth Quant Recipe Tuning API (Prototype)": [[22, null]], "Smooth Quantization Autotune": [[16, "smooth-quantization-autotune"]], "Smooth Quantization INT8": [[6, "smooth-quantization-int8"]], "SmoothQuant": [[29, "smoothquant"]], "Software Configuration": [[33, "software-configuration"]], "Software Version": [[30, "software-version"], [30, "id3"], [30, "id6"]], "Split SGD": [[21, null], [21, "id2"]], "Static Quantization": [[6, "static-quantization"], [15, "static-quantization"]], "Stochastic Gradient Descent (SGD)": [[21, "stochastic-gradient-descent-sgd"]], "Support": [[1, "support"]], "TCMalloc": [[31, "tcmalloc"], [33, "tcmalloc"]], "The origin command with ipex launch": [[10, "the-origin-command-with-ipex-launch"]], "Tips": [[5, "tips"]], "Tips and Debugging": [[5, "tips-and-debugging"]], "TorchDynamo": [[26, "torchdynamo"]], "TorchDynamo Mode (Beta, NEW feature from 2.0.0)": [[6, "torchdynamo-mode-beta-new-feature-from-2-0-0"], [6, "id11"]], "TorchScript Mode": [[6, "torchscript-mode"], [6, "id8"]], "TorchServe with Intel\u00ae Extension for PyTorch*": [[32, null]], "TorchServe with Launcher": [[32, "torchserve-with-launcher"]], "Training": [[6, "training"]], "Training Support": [[8, "training-support"]], "Troubleshooting": [[26, null]], "Unit testing": [[5, "unit-testing"]], "Usage Example": [[11, "usage-example"], [12, "usage-example"], [16, "usage-example"]], "Usage Examples": [[14, "usage-examples"], [31, "usage-examples"]], "Usage of Hypertune": [[14, "usage-of-hypertune"]], "Usage of Jemalloc/TCMalloc/Default memory allocator": [[31, "usage-of-jemalloc-tcmalloc-default-memory-allocator"]], "Usage of OpenMP library": [[31, "usage-of-openmp-library"]], "Usage of launch script": [[31, "usage-of-launch-script"]], "Use Case": [[8, "use-case"]], "Use Case not supported": [[10, "use-case-not-supported"]], "Use Cases": [[20, "use-cases"]], "User defined search space": [[14, "user-defined-search-space"]], "Using a fixed 
alpha": [[16, "using-a-fixed-alpha"]], "V. Throughput mode": [[31, "v-throughput-mode"]], "VI. Latency mode": [[31, "vi-latency-mode"]], "VII. Your designated number of instances": [[31, "vii-your-designated-number-of-instances"]], "VIII. Your designated number of instances and instance index": [[31, "viii-your-designated-number-of-instances-and-instance-index"]], "Vec specific kernel example:": [[17, "vec-specific-kernel-example"]], "Verified for distributed inference mode via DeepSpeed": [[28, "verified-for-distributed-inference-mode-via-deepspeed"]], "Verified for single instance mode": [[28, "verified-for-single-instance-mode"]], "Weight Only Quantization (WOQ)": [[29, "weight-only-quantization-woq"]], "Weight Only Quantization INT8/INT4": [[6, "weight-only-quantization-int8-int4"]], "What is Channels Last": [[18, "what-is-channels-last"]], "What\u2019s Changed": [[34, "what-s-changed"], [34, "id37"]], "What\u2019s New": [[34, "what-s-new"], [34, "id40"], [34, "id42"], [34, "id45"], [34, "id48"]], "Writing Channels Last Kernels": [[18, "writing-channels-last-kernels"]], "Writing documentation": [[5, "writing-documentation"]], "a. Create NHWC Memory": [[18, "a-create-nhwc-memory"]], "a. NCHW (default)": [[18, "a-nchw-default"]], "a. Status on CPU": [[18, "a-status-on-cpu"]], "a. tensor creation": [[18, "a-tensor-creation"]], "b. Create Convolution Primitive": [[18, "b-create-convolution-primitive"]], "b. NHWC (WIP for CPU)": [[18, "b-nhwc-wip-for-cpu"]], "b. Register Channels Last Kernel in ATen Native Manner": [[18, "b-register-channels-last-kernel-in-aten-native-manner"]], "b. tensor conversion": [[18, "b-tensor-conversion"]], "c. Blocked (nChw16c)": [[18, "c-blocked-nchw16c"]], "c. Register oneDNN Kernel on Channels Last": [[18, "c-register-onednn-kernel-on-channels-last"]], "c. model conversion": [[18, "c-model-conversion"]], "d. 
operator coverage": [[18, "d-operator-coverage"]], "default": [[9, "default"]], "disable": [[9, "disable"]], "enable": [[9, "enable"]], "ipex.llm Optimized Model List for Inference": [[28, "ipex-llm-optimized-model-list-for-inference"]], "oneDNN NHWC APIs": [[18, "onednn-nhwc-apis"]], "torch.compile (Beta, NEW feature from 2.0.0)": [[7, "torch-compile-beta-new-feature-from-2-0-0"]], "your_conf_file": [[14, "your-conf-file"]]}, "docnames": ["design_doc/cpu/isa_dyndisp", "index", "tutorials/api_doc", "tutorials/blogs_publications", "tutorials/cheat_sheet", "tutorials/contribution", "tutorials/examples", "tutorials/features", "tutorials/features/amp", "tutorials/features/auto_channels_last", "tutorials/features/codeless_optimization", "tutorials/features/fast_bert", "tutorials/features/graph_capture", "tutorials/features/graph_optimization", "tutorials/features/hypertune", "tutorials/features/int8_overview", "tutorials/features/int8_recipe_tuning_api", "tutorials/features/isa_dynamic_dispatch", "tutorials/features/nhwc", "tutorials/features/optimizer_fusion", "tutorials/features/runtime_extension", "tutorials/features/split_sgd", "tutorials/features/sq_recipe_tuning_api", "tutorials/getting_started", "tutorials/installation", "tutorials/introduction", "tutorials/known_issues", "tutorials/license", "tutorials/llm", "tutorials/llm/llm_optimize", "tutorials/performance", "tutorials/performance_tuning/launch_script", "tutorials/performance_tuning/torchserve", "tutorials/performance_tuning/tuning_guide", "tutorials/releases"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["design_doc/cpu/isa_dyndisp.md", "index.rst", "tutorials/api_doc.rst", "tutorials/blogs_publications.md", "tutorials/cheat_sheet.md", 
"tutorials/contribution.md", "tutorials/examples.md", "tutorials/features.rst", "tutorials/features/amp.md", "tutorials/features/auto_channels_last.md", "tutorials/features/codeless_optimization.md", "tutorials/features/fast_bert.md", "tutorials/features/graph_capture.md", "tutorials/features/graph_optimization.md", "tutorials/features/hypertune.md", "tutorials/features/int8_overview.md", "tutorials/features/int8_recipe_tuning_api.md", "tutorials/features/isa_dynamic_dispatch.md", "tutorials/features/nhwc.md", "tutorials/features/optimizer_fusion.md", "tutorials/features/runtime_extension.md", "tutorials/features/split_sgd.rst", "tutorials/features/sq_recipe_tuning_api.md", "tutorials/getting_started.md", "tutorials/installation.md", "tutorials/introduction.rst", "tutorials/known_issues.md", "tutorials/license.md", "tutorials/llm.rst", "tutorials/llm/llm_optimize.md", "tutorials/performance.md", "tutorials/performance_tuning/launch_script.md", "tutorials/performance_tuning/torchserve.md", "tutorials/performance_tuning/tuning_guide.md", "tutorials/releases.md"], "indexentries": {"autotune() (in module intel_extension_for_pytorch.quantization)": [[2, "intel_extension_for_pytorch.quantization.autotune", false]], "convert() (in module intel_extension_for_pytorch.quantization)": [[2, "intel_extension_for_pytorch.quantization.convert", false]], "cpupool (class in intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.CPUPool", false]], "enable_onednn_fusion() (in module intel_extension_for_pytorch)": [[2, "intel_extension_for_pytorch.enable_onednn_fusion", false]], "fast_bert() (in module intel_extension_for_pytorch)": [[2, "intel_extension_for_pytorch.fast_bert", false]], "fast_layer_norm() (in module intel_extension_for_pytorch.llm.functional)": [[2, "intel_extension_for_pytorch.llm.functional.fast_layer_norm", false]], "fastlayernorm (class in intel_extension_for_pytorch.llm.modules)": [[2, 
"intel_extension_for_pytorch.llm.modules.FastLayerNorm", false]], "frozenbatchnorm2d (class in intel_extension_for_pytorch.nn)": [[7, "intel_extension_for_pytorch.nn.FrozenBatchNorm2d", false]], "get_core_list_of_node_id() (in module intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.get_core_list_of_node_id", false]], "get_smooth_quant_qconfig_mapping() (in module intel_extension_for_pytorch.quantization)": [[2, "intel_extension_for_pytorch.quantization.get_smooth_quant_qconfig_mapping", false]], "get_weight_only_quant_qconfig_mapping() (in module intel_extension_for_pytorch.quantization)": [[2, "intel_extension_for_pytorch.quantization.get_weight_only_quant_qconfig_mapping", false]], "indirect_access_kv_cache_attention() (in module intel_extension_for_pytorch.llm.functional)": [[2, "intel_extension_for_pytorch.llm.functional.indirect_access_kv_cache_attention", false]], "indirectaccesskvcacheattention (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.IndirectAccessKVCacheAttention", false]], "intel_extension_for_pytorch": [[2, "module-intel_extension_for_pytorch", false]], "intel_extension_for_pytorch.cpu.runtime": [[2, "module-intel_extension_for_pytorch.cpu.runtime", false]], "intel_extension_for_pytorch.llm": [[2, "module-intel_extension_for_pytorch.llm", false]], "intel_extension_for_pytorch.llm.functional": [[2, "module-intel_extension_for_pytorch.llm.functional", false]], "intel_extension_for_pytorch.llm.modules": [[2, "module-intel_extension_for_pytorch.llm.modules", false]], "intel_extension_for_pytorch.quantization": [[2, "module-intel_extension_for_pytorch.quantization", false]], "interaction() (in module intel_extension_for_pytorch.nn.functional)": [[7, "intel_extension_for_pytorch.nn.functional.interaction", false]], "is_runtime_ext_enabled() (in module intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.is_runtime_ext_enabled", 
false]], "linear2silumul (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.Linear2SiluMul", false]], "linearadd (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearAdd", false]], "linearaddadd (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearAddAdd", false]], "lineargelu (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearGelu", false]], "linearmul (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearMul", false]], "linearnewgelu (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearNewGelu", false]], "linearrelu (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearRelu", false]], "linearsilu (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearSilu", false]], "linearsilumul (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearSiluMul", false]], "mergedembeddingbag (class in intel_extension_for_pytorch.nn.modules)": [[7, "intel_extension_for_pytorch.nn.modules.MergedEmbeddingBag", false]], "mergedembeddingbagwithsgd (class in intel_extension_for_pytorch.nn.modules)": [[7, "intel_extension_for_pytorch.nn.modules.MergedEmbeddingBagWithSGD", false]], "module": [[2, "module-intel_extension_for_pytorch", false], [2, "module-intel_extension_for_pytorch.cpu.runtime", false], [2, "module-intel_extension_for_pytorch.llm", false], [2, "module-intel_extension_for_pytorch.llm.functional", false], [2, "module-intel_extension_for_pytorch.llm.modules", false], [2, "module-intel_extension_for_pytorch.quantization", false]], "multistreammodule (class in intel_extension_for_pytorch.cpu.runtime)": [[2, 
"intel_extension_for_pytorch.cpu.runtime.MultiStreamModule", false]], "multistreammodulehint (class in intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.MultiStreamModuleHint", false]], "optimize() (in module intel_extension_for_pytorch)": [[2, "intel_extension_for_pytorch.optimize", false]], "optimize() (in module intel_extension_for_pytorch.llm)": [[2, "intel_extension_for_pytorch.llm.optimize", false]], "pagedattention (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.PagedAttention", false]], "pin (class in intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.pin", false]], "prepare() (in module intel_extension_for_pytorch.quantization)": [[2, "intel_extension_for_pytorch.quantization.prepare", false]], "rms_norm() (in module intel_extension_for_pytorch.llm.functional)": [[2, "intel_extension_for_pytorch.llm.functional.rms_norm", false]], "rmsnorm (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.RMSNorm", false]], "rotary_embedding() (in module intel_extension_for_pytorch.llm.functional)": [[2, "intel_extension_for_pytorch.llm.functional.rotary_embedding", false]], "rotaryembedding (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.RotaryEmbedding", false]], "task (class in intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.Task", false]], "varlen_attention() (in module intel_extension_for_pytorch.llm.functional)": [[2, "intel_extension_for_pytorch.llm.functional.varlen_attention", false]], "varlenattention (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.VarlenAttention", false]], "verbose (class in intel_extension_for_pytorch)": [[2, "intel_extension_for_pytorch.verbose", false]]}, "objects": {"": [[2, 0, 0, "-", "intel_extension_for_pytorch"]], 
"intel_extension_for_pytorch": [[2, 2, 1, "", "enable_onednn_fusion"], [2, 2, 1, "", "fast_bert"], [2, 0, 0, "-", "llm"], [2, 2, 1, "", "optimize"], [2, 0, 0, "-", "quantization"], [2, 1, 1, "", "verbose"]], "intel_extension_for_pytorch.cpu": [[2, 0, 0, "-", "runtime"]], "intel_extension_for_pytorch.cpu.runtime": [[2, 1, 1, "", "CPUPool"], [2, 1, 1, "", "MultiStreamModule"], [2, 1, 1, "", "MultiStreamModuleHint"], [2, 1, 1, "", "Task"], [2, 2, 1, "", "get_core_list_of_node_id"], [2, 2, 1, "", "is_runtime_ext_enabled"], [2, 1, 1, "", "pin"]], "intel_extension_for_pytorch.llm": [[2, 0, 0, "-", "functional"], [2, 0, 0, "-", "modules"], [2, 2, 1, "", "optimize"]], "intel_extension_for_pytorch.llm.functional": [[2, 2, 1, "", "fast_layer_norm"], [2, 2, 1, "", "indirect_access_kv_cache_attention"], [2, 2, 1, "", "rms_norm"], [2, 2, 1, "", "rotary_embedding"], [2, 2, 1, "", "varlen_attention"]], "intel_extension_for_pytorch.llm.modules": [[2, 1, 1, "", "FastLayerNorm"], [2, 1, 1, "", "IndirectAccessKVCacheAttention"], [2, 1, 1, "", "Linear2SiluMul"], [2, 1, 1, "", "LinearAdd"], [2, 1, 1, "", "LinearAddAdd"], [2, 1, 1, "", "LinearGelu"], [2, 1, 1, "", "LinearMul"], [2, 1, 1, "", "LinearNewGelu"], [2, 1, 1, "", "LinearRelu"], [2, 1, 1, "", "LinearSilu"], [2, 1, 1, "", "LinearSiluMul"], [2, 1, 1, "", "PagedAttention"], [2, 1, 1, "", "RMSNorm"], [2, 1, 1, "", "RotaryEmbedding"], [2, 1, 1, "", "VarlenAttention"]], "intel_extension_for_pytorch.nn": [[7, 1, 1, "", "FrozenBatchNorm2d"]], "intel_extension_for_pytorch.nn.functional": [[7, 2, 1, "", "interaction"]], "intel_extension_for_pytorch.nn.modules": [[7, 1, 1, "", "MergedEmbeddingBag"], [7, 1, 1, "", "MergedEmbeddingBagWithSGD"]], "intel_extension_for_pytorch.quantization": [[2, 2, 1, "", "autotune"], [2, 2, 1, "", "convert"], [2, 2, 1, "", "get_smooth_quant_qconfig_mapping"], [2, 2, 1, "", "get_weight_only_quant_qconfig_mapping"], [2, 2, 1, "", "prepare"]]}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", 
"class", "Python class"], "2": ["py", "function", "Python function"]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:function"}, "terms": {"": [2, 3, 5, 8, 10, 14, 15, 18, 19, 20, 21, 22, 26, 31, 32, 33], "0": [1, 2, 4, 5, 8, 10, 11, 13, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 30, 31, 32, 33], "00": [31, 34], "00000": 21, "00000000000602e7": 17, "0000012345": 21, "001": [6, 8], "0016": 30, "01": [2, 4, 7, 16, 31, 32, 34], "02": [30, 32], "02x": 30, "03": 32, "03x": 30, "04": [30, 31], "04x": 30, "05": [2, 7, 10, 30, 31], "05x": 30, "06": [2, 31, 32], "06x": 30, "07": 31, "07x": 30, "08": 31, "08x": 30, "09": [17, 31], "096": 32, "09864": 2, "09x": 30, "0x00007f3cde954000": 6, "0x00007f3ce16ac000": 6, "0x00007f3cf70fc000": 6, "0x00007f3cf985a000": 6, "0x00007f3cf98e0000": 6, "0x1": 17, "0x700001c": 30, "0x7fff": 17, "0xd0002a0": 30, "0xffff": 17, "1": [1, 2, 3, 4, 6, 8, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 28, 29, 30, 31, 33], "10": [7, 14, 16, 17, 18, 21, 25, 26, 31, 32, 33], "100": [2, 4, 14, 16, 17, 30, 32], "10000": 2, "1009": 30, "100mb": 34, "1024": [30, 33], "102b": 28, "1032": 34, "10438": 2, "1053": 34, "1074": 34, "10k": 6, "10x": 30, "11": [17, 31, 32], "111": 33, "112": [26, 30, 33, 34], "117": 31, "118": 31, "11b": [28, 34], "11x": 30, "12": [6, 10, 14, 17, 30, 31, 32], "1200": 30, "12345": 21, "1234500000": 21, "1234512345": 21, "125m": 6, "127": [6, 31, 34], "128": [6, 8, 10, 13, 20, 30, 34], "128k": [2, 28, 34], "128task": 30, "1295": 34, "12b": 28, "12x": 30, "13": [3, 10, 17, 30, 31, 32, 33], "1318": 34, "1322": 34, "1328": 34, "1330": 34, "1338": 34, "1341": 34, "1353": 34, "1355": 34, "1367": 34, "1373": 34, "1376": 34, "1384": 34, "1391": 34, "1392": 34, "13b": [28, 30, 34], "13x": 30, "14": [31, 32, 34], "140": 31, "1414": 34, "1419": 34, "143": 31, "146": 31, "1473": 34, "1488": 34, "149": 31, "14x": 30, "15": [14, 17, 30, 31, 32], "151": 31, "1513": 34, "1517": 34, "154": 31, "1563": 34, "1564": 34, "1566": 
34, "1568": 34, "157": 31, "1580": 34, "1585": 34, "1587": 34, "1589": 34, "159": 31, "1590": 34, "1592": 34, "1593": 34, "1594": 34, "15x": 30, "16": [2, 17, 20, 21, 30, 31, 32], "160": 30, "162": 31, "164": 31, "1664": 34, "167": 31, "1677": 34, "1682": 34, "1688": 34, "1695": 34, "16gb": 30, "16x": 30, "16xlarg": 30, "17": [6, 30, 31, 32], "170": 30, "175": 31, "176": 31, "177": 31, "17th": 30, "18": [30, 31, 32], "18x": 30, "19": [7, 30, 31, 32, 34], "199": 30, "19x": 30, "1_6b": 28, "1b": 34, "1b7": 28, "1d": 18, "1e": [2, 7, 10, 16], "1mb": 33, "2": [1, 2, 3, 8, 10, 16, 17, 18, 20, 21, 25, 26, 27, 28, 29, 30, 31, 33], "20": [2, 7, 18, 30, 31, 32, 34], "2006080250": 30, "200m": 33, "2017": 3, "2019": 3, "2020": 3, "2021": [3, 17, 31, 32], "2022": [3, 31, 32], "2023": [2, 3, 30], "2024": 33, "2048": [2, 6], "205": 34, "20b": 28, "20x": 30, "21": [30, 31, 32], "2104": 2, "2105": 30, "2137": 34, "2195": 34, "2198": 34, "21x": 30, "22": [6, 30, 31, 32], "220m": 34, "220mb": 34, "2211": 2, "2229": 34, "223": 32, "2236": 34, "224": [6, 8, 10, 12, 13, 30, 32, 34], "224m": 34, "2251": 34, "2253": 34, "2257": 34, "2264": 34, "2275": 34, "2278": 34, "2280": 34, "2283": 34, "2290": 34, "2292": 34, "2299": 34, "23": [21, 31, 32], "2315": 34, "2317": 34, "2319": 34, "233": 31, "2334": 34, "2349": 34, "235": 31, "236": 31, "2392": 34, "24": [31, 32], "2412": 34, "2433": 34, "244": 13, "2468": 34, "2469": 34, "2473": 34, "2476": 34, "2480": 34, "2491": 34, "24x": 30, "24xlarg": 32, "25": [31, 32], "2511": 34, "2550": 34, "256": [2, 30], "2561": 34, "2568": 34, "256gb": 30, "2584": 34, "26": [30, 31, 32], "2613": 34, "2617": 34, "2627": 34, "2631": 34, "2641": 34, "2663": 34, "2666": 33, "2675": 34, "26x": 30, "27": [31, 32, 33], "2704": 34, "2733": 34, "274": 32, "2747": 34, "278": 34, "27x": 30, "28": [10, 14, 16, 30, 31, 32, 33, 34], "2883": 34, "29": [7, 31, 32], "2910": 34, "2911": 34, "2928": 34, "29500": [6, 31], "2985": 34, "2987": 34, "29x": 30, "2b": 28, "2d": 18, 
"2nd": 28, "2x": 34, "3": [2, 5, 6, 7, 8, 10, 12, 13, 14, 16, 17, 18, 20, 21, 28, 30, 31, 33], "30": [31, 32], "3030": 34, "305": 30, "3079": 34, "3080": 34, "30b": 28, "30ghz": 30, "30x": 30, "31": [31, 32], "3116": 34, "3143": 34, "3185": 34, "31x": 30, "32": [2, 6, 18, 21, 23, 30, 31, 32], "3200": 30, "3209": 34, "3214": 34, "3218": 34, "3246": 34, "3248": 34, "3291": 34, "32x": 30, "32x16d": 30, "33": [17, 31, 32], "3305": 34, "3307": 34, "3333": 34, "339081764221191": 14, "33x": 30, "34": [31, 32], "35": [31, 32], "355": 31, "356": 31, "35x": 30, "36": [30, 31, 32], "36x": 30, "37": [31, 32, 34], "38": [31, 32], "384": [10, 32, 34], "384task": 30, "38x": 30, "39": [30, 31, 32, 34], "39x": 30, "3b": [28, 34], "3d": 34, "3e": [10, 34], "3rd": [3, 7, 21, 30, 34], "4": [2, 6, 11, 13, 14, 18, 20, 23, 28, 30, 31, 33], "40": [30, 31, 32, 34], "407": 34, "409": 26, "4096": [2, 33], "40b": 28, "40mb": 34, "41": [31, 32], "42": [31, 32], "425": 34, "43": [6, 11, 31, 32], "432": 34, "438": 34, "44": [30, 31, 32], "44x": 30, "45": [6, 11, 31, 32], "452": 34, "45x": 30, "46": [31, 32], "47": [31, 32], "470": 31, "471": 31, "473": 31, "476": 31, "479": 31, "47x": 30, "48": [30, 31, 32], "48x": 30, "49": [30, 31, 32], "49786": 34, "4bit": 34, "4k": 28, "4th": [28, 30], "4x": 3, "5": [2, 6, 10, 13, 14, 16, 17, 18, 19, 20, 21, 22, 26, 28, 30, 31, 32, 33], "50": [18, 31, 32], "50ghz": 33, "51": [31, 32], "512": [1, 6, 11, 16, 25, 28, 31], "513": 31, "52": [31, 32], "524": 34, "53": [31, 32], "531": 34, "54": [31, 32], "55": [31, 32, 33], "551": 34, "55x": 30, "56": [30, 31, 32, 33], "57": 31, "57x": 30, "58": [17, 31], "589": 34, "58x": 30, "59": 31, "591": 31, "5d": 16, "5m": 34, "5mb": 34, "5rc3": 34, "5x": 34, "6": [2, 5, 7, 11, 14, 20, 30, 31, 32, 33, 34], "60": 31, "602": 34, "61": 31, "62": 31, "62x": 30, "63": [31, 34], "64": [2, 8, 10, 16, 20, 30, 31, 34], "642": 34, "647": 34, "648": 34, "64byte": 34, "64gb": 30, "65": 31, "654": 31, "655": 31, "65536": 33, "657": 34, 
"66": [17, 31, 34], "67": [30, 31, 34], "674": 34, "67x": 30, "68": [31, 34], "684": 34, "685": 34, "68m": 34, "69": [30, 31], "692": 34, "6b": [2, 28, 30], "7": [10, 14, 17, 20, 21, 31, 32, 34], "70": 31, "70b": [28, 34], "71": 31, "711": 34, "71x": 30, "72": 31, "73": 31, "74": 31, "75": [30, 31], "75x": 30, "76": [30, 31], "760": [31, 32], "761": [31, 32], "762": 32, "763": 32, "764": 31, "768gb": 30, "77": 31, "77x": 30, "78": [30, 31], "784": 31, "787": 34, "78x": 30, "79": [30, 31], "7b": [6, 28, 30, 34], "7f": 16, "7m": 34, "7x": 34, "8": [14, 16, 30, 31, 32, 33], "80": [5, 30, 31], "81": [30, 31], "8180": 32, "8180m": [14, 33], "81x": 30, "82": 31, "822": 34, "83": [31, 33], "8375c": 32, "8380": 30, "8380h": 30, "83x": 30, "84": [6, 30, 31, 33], "85": [30, 31], "85x": 30, "86": [30, 31], "87": 31, "88": 31, "8b": 28, "8x": 18, "8x7b": 28, "9": [6, 7, 14, 17, 23, 25, 31, 32], "9000": 32, "9000000000": [31, 33], "9001": 32, "9002": 32, "9003": 32, "90b": 34, "90ghz": 30, "92": 30, "93": 30, "96": 30, "96x": 30, "97": 30, "975": 32, "98": 30, "981": 32, "982": 32, "99": [16, 30, 34], "992": 34, "A": [2, 5, 6, 7, 10, 11, 17, 26, 28, 31, 33, 34], "And": [15, 20, 32, 34], "As": [10, 19, 20, 28, 31, 32, 33, 34], "At": [7, 17], "But": [17, 18], "By": [17, 31, 33], "For": [1, 2, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19, 20, 21, 23, 24, 25, 26, 28, 29, 31, 32, 33, 34], "If": [2, 5, 6, 7, 8, 9, 10, 13, 14, 15, 16, 17, 20, 26, 31, 32, 33, 34], "In": [1, 2, 6, 7, 8, 12, 16, 17, 18, 19, 21, 23, 28, 31, 32, 33, 34], "It": [2, 6, 7, 8, 10, 13, 17, 18, 20, 21, 23, 26, 29, 31, 33, 34], "Its": 28, "NOT": [18, 31], "No": [2, 18, 34], "Not": 2, "ON": 30, "On": [1, 2, 7, 18, 28, 33], "One": [2, 3, 18, 19, 31, 33], "Such": 17, "The": [0, 1, 2, 5, 6, 7, 8, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 28, 29, 30, 31, 32, 33, 34], "Then": 32, "There": [14, 16, 20, 33, 34], "These": [1, 5, 6, 7, 8, 13, 28, 34], "To": [2, 5, 6, 7, 10, 13, 15, 16, 17, 18, 20, 21, 23, 28, 
32, 33, 34], "Will": [6, 18], "With": [1, 2, 7, 10, 20, 31, 34], "_": [13, 15, 16, 17, 18, 20, 30, 31, 32, 33, 34], "___": 13, "_____": 13, "__init__": [5, 6, 8, 10, 16, 20, 26, 34], "__m256i": 17, "__m512": 17, "__m512i": 17, "__main__": [26, 31, 32, 34], "__name__": [26, 34], "_appli": 18, "_build": 5, "_c": [17, 26], "_cmp_ord_q": 17, "_core": 31, "_cvt_fp32_to_bf16": 17, "_get_current_isa_level": 17, "_get_highest_binary_support_isa_level": 17, "_get_highest_cpu_support_isa_level": 17, "_jit_set_texpr_fuser_en": 26, "_lu_with_info": 8, "_mm256_mask_storeu_epi16": 17, "_mm256_storeu_si256": 17, "_mm512_add_epi32": 17, "_mm512_and_si512": 17, "_mm512_castps_si512": 17, "_mm512_cmp_ps_mask": 17, "_mm512_cvtneps_pbh": 17, "_mm512_cvtusepi32_epi16": 17, "_mm512_loadu_p": 17, "_mm512_mask_blend_epi32": 17, "_mm512_maskz_loadu_p": 17, "_mm512_set1_epi32": 17, "_mm512_srli_epi32": 17, "_native_multi_head_attent": 8, "_reorder_cach": 2, "_sym": 2, "_timestamp_inst": 31, "_timestamp_instance_": 31, "ab": [13, 32], "abi": [6, 17, 34], "abil": 16, "abl": 15, "abnorm": [26, 34], "about": [1, 2, 5, 7, 13, 16, 32, 33, 34], "abov": [2, 5, 10, 19, 28, 30, 31, 32], "absolut": [2, 31], "abstract": [2, 11, 20], "acceler": [1, 2, 3, 6, 7, 13, 28, 29, 30, 34], "accept": [2, 34], "access": [2, 6, 7, 18, 19, 32, 34], "accommod": 18, "accompani": 34, "accord": [2, 13, 28, 33, 34], "accordingli": 16, "account": 6, "accu": 16, "accumul": 2, "accur": 8, "accuraci": [2, 3, 6, 7, 8, 15, 16, 21, 22, 26, 28, 34], "accuracy_criterion": [2, 4, 16, 34], "accuracy_criterion_typ": 2, "accuracy_criterion_valu": 2, "achang": 15, "achiev": [1, 2, 6, 7, 28, 33, 34], "across": [16, 34], "act": 34, "act_ic_observ": 2, "act_observ": 2, "act_quant_mod": 2, "action": [6, 23], "activ": [2, 6, 7, 15, 16, 20, 28, 31, 33, 34], "actual": [18, 21], "acycl": 13, "ad": [2, 7, 10, 33, 34], "adagrad": [19, 21], "adagrad_fused_step": 19, "adagrad_step": 19, "adam": 34, "adapt": 7, "adaptive_avg_pool3d": 8, 
"adaptive_max_pool3d": 8, "adaptiveaveragepoolingkrnl": 17, "add": [2, 5, 7, 8, 13, 14, 19, 21, 32, 34], "add_": 19, "add_argu": [6, 23], "add_casual_mask": 2, "add_execut": 6, "add_help": [6, 23], "addbmm": 8, "addcdiv_": 19, "addcmul_": 19, "addit": [2, 6, 7, 17, 21, 28, 34], "addition": 32, "addlayernorm": 34, "addmm": 8, "addmm_": 8, "addr": 31, "address": [7, 18, 31, 32, 33, 34], "addtion": 17, "adjust": 16, "adopt": [28, 34], "advanc": [1, 2, 6, 7, 16, 25, 28], "advantag": [1, 2, 7, 9, 12, 18, 21, 25, 30, 31, 33], "aes_ni": 17, "affect": [2, 31], "affin": [7, 10, 15, 20, 31, 32, 33], "affinit": 32, "after": [2, 5, 7, 13, 20, 21, 23, 24, 32, 33, 34], "afterward": [31, 33], "ag": 7, "again": [5, 19, 32], "against": 6, "agre": 5, "ahead": 5, "ai": [1, 2, 3, 7, 28], "aim": [7, 10, 16, 33], "aka": [7, 18], "albert": 34, "algorithm": [2, 13, 18, 30, 34], "alia": 2, "alibi": 2, "alibi_slop": 2, "align": [17, 18, 21, 34], "aliv": 32, "all": [2, 5, 6, 8, 13, 14, 17, 19, 20, 28, 29, 32, 33, 34], "all_logical_cor": 14, "all_physical_cor": 14, "allcat": 2, "allenai": 26, "alloc": [2, 10, 20, 28, 30, 32, 34], "allow": [2, 8, 14, 16, 22, 33, 34], "allreduc": 2, "almost": 18, "along": [2, 5, 6, 21, 33, 34], "alpha": [2, 6, 19, 22], "alpha_max": [16, 22], "alpha_min": [16, 22], "alpha_step": [16, 22], "alphafold2": 34, "alreadi": [1, 5, 6, 18, 28, 33], "also": [1, 2, 6, 7, 10, 13, 14, 16, 18, 19, 28, 30, 31, 33, 34], "altern": [2, 6, 18], "although": [2, 33], "alwai": [5, 6, 7, 8, 18, 31, 33, 34], "amazon": 32, "among": [2, 31, 32, 33], "amount": [2, 16, 26, 28, 33], "amp": [4, 6, 10, 23, 26, 34], "amp_dtyp": [6, 23], "amp_en": [6, 23], "ampconf": 34, "amplifi": 1, "amx": [1, 3, 6, 7, 17, 25, 28, 30], "amx_bf16": 17, "amx_int8": 17, "amx_til": 17, "an": [1, 2, 5, 6, 7, 8, 10, 11, 13, 14, 16, 17, 18, 19, 20, 21, 26, 31, 32, 33, 34], "anaconda": 17, "analysi": 33, "ani": [2, 5, 8, 10, 17, 18, 32, 34], "announc": 34, "anonym": 17, "anoth": [14, 31, 33, 34], "answer": [18, 30], 
"anymor": [7, 34], "anyplac": 4, "ao": [2, 6, 15], "apach": [27, 32], "api": [1, 3, 6, 10, 11, 15, 20, 26, 33, 34], "app": [6, 34], "append": [6, 7], "append_torchlib_if_found": 6, "appli": [2, 6, 7, 8, 12, 13, 16, 18, 19, 21, 23, 26, 28, 29, 31, 34], "applic": [1, 2, 7, 20, 28, 32, 33], "apply_funct": 2, "appropri": 33, "apr": 3, "ar": [1, 2, 3, 5, 6, 7, 8, 10, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 28, 29, 30, 31, 32, 33, 34], "arang": [2, 6, 16], "arbitrari": 2, "arc": 3, "architectur": [2, 28, 30, 33], "area": [7, 14], "aren": 5, "arg": [2, 4, 6, 7, 14, 16, 19, 23, 31, 32, 34], "argc": 6, "argmax": 16, "argpars": [6, 23], "argument": [2, 6, 7, 22, 26, 31], "argumentpars": [6, 23], "argv": 6, "around": 31, "arrai": 18, "articl": [30, 33], "arxiv": 2, "ask": 5, "assign": [18, 31, 32, 33], "assum": [2, 7, 8, 23, 32, 33, 34], "asu": 33, "asymmetr": 2, "async": [20, 34], "asynchron": [2, 7], "aten": [2, 6, 7, 34], "aten_cpu_cap": 17, "attach": 33, "attent": [1, 2, 7, 28, 34], "attention_mask": [2, 6], "attention_mask_pad": 6, "attn_implement": [6, 11], "attn_output": 2, "attn_weight": 2, "attribut": 18, "aug": [3, 30], "auto": [2, 6, 10, 17, 18, 22, 23, 26, 28, 31, 33, 34], "auto_alpha_arg": 16, "auto_ipex": 34, "auto_kernel_select": [2, 7, 30], "autocast": [4, 6, 7, 10, 23, 34], "autoclass": 5, "autoconfig": [6, 23], "autofunct": 5, "autom": [4, 7, 8, 14, 31, 32, 34], "automat": [1, 2, 6, 7, 9, 10, 12, 13, 15, 16, 18, 22, 28, 31, 32, 33, 34], "automaticlli": 2, "automixprecis": 34, "automodelforcausallm": [6, 23, 29, 34], "autotoken": [6, 23], "autotp": 28, "autotun": [2, 4, 22, 34], "avaiabl": 2, "avail": [1, 2, 6, 7, 11, 17, 20, 22, 23, 29, 31, 33, 34], "avg_pool3d": 8, "avoid": [2, 10, 20, 21, 26, 31, 32, 33, 34], "avx": [1, 6, 17, 25, 28], "avx2": [17, 26, 34], "avx256": 17, "avx2_vnni": 17, "avx512": [7, 17, 18, 32, 34], "avx512_4fmap": 17, "avx512_4vnniw": 17, "avx512_bf16": 17, "avx512_bitalg": 17, "avx512_bw": 17, "avx512_cd": 17, 
"avx512_core_vnni": 34, "avx512_dq": 17, "avx512_er": 17, "avx512_f": 17, "avx512_fp16": 17, "avx512_ifma": 17, "avx512_pf": 17, "avx512_vbmi": 17, "avx512_vbmi2": 17, "avx512_vl": 17, "avx512_vnni": 17, "avx512_vp2intersect": 17, "avx512_vpclmul": 17, "avx512_vpopcntdq": 17, "avx_vnni": 17, "awar": [18, 20, 31, 32], "awq": [2, 34], "b": [7, 8, 16, 28], "back": [6, 12, 17, 18, 21, 26], "backbon": 2, "backend": [1, 2, 3, 6, 7, 12, 13, 16, 17, 23, 26, 28, 31, 33, 34], "background": 33, "background_thread": [31, 33], "backpropag": 16, "backward": [6, 7, 8, 16, 21, 33, 34], "bactchnorm": 34, "baddbmm": 8, "bag": [26, 34], "baichuan": [2, 28, 34], "baichuan2": [28, 34], "bake": 34, "balanc": [7, 16, 22, 33], "bandwidth": [28, 34], "base": [1, 2, 3, 4, 5, 6, 7, 10, 11, 17, 20, 21, 26, 28, 29, 30, 32, 33, 34], "base_dir": 29, "base_text_classif": 30, "baselin": [16, 22, 34], "basic": [2, 4, 16, 21, 33, 34], "batch": [2, 6, 7, 13, 16, 18, 20, 23, 26, 30, 32, 34], "batch_decod": [6, 23], "batch_id": 6, "batch_idx": [6, 13], "batch_siz": [2, 6, 11, 13, 16, 18, 23, 32], "batchnorm": [13, 17, 18, 26, 34], "batchnorm2d": [7, 10, 26, 34], "batchsiz": [2, 20], "beam": [2, 28], "beam_idx": 2, "beam_idx_tmp": 6, "beam_width": 28, "becam": 34, "becaus": [8, 17, 18, 21, 28, 33, 34], "becom": [7, 28, 33], "been": [0, 1, 6, 7, 10, 17, 18, 28, 31, 33, 34], "beeter": 28, "befor": [1, 2, 5, 6, 13, 14, 17, 18, 20, 31, 33, 34], "begin": 5, "beginn": 16, "behavior": [2, 20, 31, 33], "behaviour": 10, "being": [7, 33], "believ": [8, 18], "below": [6, 8, 10, 14, 19, 20, 21, 22, 23, 26, 28, 31, 32, 33, 34], "bench": 32, "benchmark": [6, 26, 30, 31, 34], "benefici": 18, "benefit": [6, 7, 8, 10, 20, 21, 28, 32, 33, 34], "benifit": 2, "bert": [3, 4, 10, 30, 34], "bert_int8_jit": 32, "bert_ipex_int8": 32, "bertmodel": [4, 6, 11, 32], "bertmodelmodel": 4, "besid": [28, 33, 34], "best": [2, 6, 7, 8, 14, 16, 17, 22, 24, 28, 33, 34], "beta": [23, 26], "better": [1, 2, 6, 7, 15, 18, 20, 28, 31, 32, 33, 
34], "between": [7, 8, 17, 20, 33, 34], "beyond": 7, "bf16": [2, 3, 7, 17, 19, 21, 23, 26, 28, 30, 34], "bf16_gw": 21, "bf16_w": 21, "bfloat16": [2, 3, 4, 7, 10, 11, 17, 18, 23, 29, 31, 34], "bfp16": 34, "bia": [2, 8, 20, 34], "big": [7, 18], "bigcod": 28, "bigscienc": 28, "bin": [5, 6, 17, 31, 32], "binari": [5, 6, 7, 8, 17, 34], "binary_cross_entropi": 8, "binary_cross_entropy_with_logit": 8, "bind": [6, 7, 31, 32, 33, 34], "bio": 30, "bit": [21, 28], "blob": 2, "block": [2, 5, 16, 20, 22, 28, 33, 34], "block_numb": 2, "block_siz": 2, "block_tabl": 2, "blocktim": 31, "blockwis": 16, "blog": [2, 34], "bloom": [2, 28], "bmm": [8, 34], "bmp": 18, "bn": [2, 10, 15, 26, 34], "bn_fold": 2, "bodi": 17, "bool": [2, 14], "boolean": [7, 34], "booltensor": 7, "boost": [3, 6, 7, 9, 21, 30, 31, 33, 34], "both": [1, 2, 6, 7, 16, 18, 19, 21, 28, 29, 31, 32, 33, 34], "bother": 16, "bottl": 19, "bottleneck": [2, 28], "bottom": 21, "bound": [19, 20, 28, 33], "box": [6, 10, 33], "branch": [1, 7, 30], "break": [6, 16, 34], "brew": 5, "brief": [18, 28, 34], "briefli": 33, "bring": [2, 6, 7, 9, 15, 16, 21, 28, 31, 33, 34], "broad": [7, 9, 34], "broader": 34, "brought": [33, 34], "buffer": [2, 28], "bug": [1, 5, 34], "bui": 21, "build": [6, 28, 33, 34], "built": [7, 17, 20, 34], "busi": 33, "c": [1, 7, 8, 16, 17, 20, 26, 28, 31, 32, 33, 34], "c1": 20, "c10": [6, 17], "c620": 33, "cach": [2, 5, 7, 19, 20, 30, 34], "cache_weight_for_large_batch": 2, "caff": 3, "calcul": [1, 2, 8, 16, 21, 22], "cali_dataset": 34, "calib_dataload": [2, 6, 16, 34], "calib_dataset": [6, 29], "calib_evalu": 6, "calib_func": 2, "calib_sampl": 29, "calibr": [2, 13, 22, 26, 29, 30, 32, 34], "calibrated_model": 34, "calibration_data_load": [4, 6, 13], "calibration_data_set": [15, 34], "calibration_model": 29, "calibration_sampl": 6, "call": [2, 6, 8, 13, 17, 18, 21, 26, 32, 33, 34], "caller": [26, 34], "can": [1, 2, 5, 6, 7, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 26, 28, 29, 30, 31, 32, 33, 34], 
"cannot": [8, 19, 26, 31, 34], "canon": 18, "capabl": [3, 17, 34], "capac": [21, 30], "captur": [4, 34], "card": 18, "care": 32, "carri": 30, "case": [2, 6, 7, 9, 12, 16, 17, 18, 28, 31, 33, 34], "cases": 32, "cast": [2, 8, 21, 28], "casual": 26, "cat": [8, 31, 32, 34], "catch": 6, "categor": 7, "categori": [8, 34], "caus": [2, 7, 21, 26, 28, 31, 33, 34], "causal": 2, "cc": [5, 6, 17], "ccl": [6, 31, 34], "cd": [5, 6], "cdist": 8, "center": 34, "cento": 30, "cerr": 6, "certain": [1, 7, 26, 28, 29, 31, 33], "ch_axi": 2, "chain": 21, "chang": [2, 5, 6, 7, 8, 10, 11, 12, 15, 17, 18, 20, 23, 25, 26, 29, 31], "changed_onli": 5, "changelog": 34, "channel": [2, 3, 10, 15, 16, 26, 34], "channels_last": [6, 7, 18, 23, 33, 34], "char": 6, "charact": 5, "chat": [28, 34], "chatglm": [2, 28], "chatglm2": [28, 34], "chatglm3": [28, 34], "cheat": 23, "check": [2, 5, 6, 7, 13, 18, 28, 29, 31, 34], "check_trac": [6, 13, 32], "checkpoint": [2, 6, 29], "checkpoints_json": 29, "chip": 33, "chipset": 33, "choic": [6, 21, 23, 31, 34], "choleski": 8, "cholesky_invers": 8, "cholesky_solv": 8, "choos": [6, 8, 20, 23, 31, 33, 34], "chosen": [8, 14, 17], "chunk": 34, "chw": 18, "chwn": 18, "ci": 5, "cifar10": [6, 13], "circumst": 8, "clamp": 13, "clang": 5, "class": [2, 5, 6, 7, 8, 10, 16, 20, 26, 34], "classif": [26, 30], "claus": [7, 10, 19], "clean": 5, "clear": 10, "clibrat": 34, "click": 3, "clone": 5, "close": [18, 31, 33], "cloud": 3, "clr": 19, "cmake": [5, 6, 17, 34], "cmake_minimum_requir": 6, "cmakefil": 17, "cmakelint": 5, "cmakelist": 6, "cnn": [7, 18, 26, 30, 33, 34], "co": [2, 34], "coco": 30, "code": [1, 2, 5, 6, 7, 10, 11, 12, 13, 18, 19, 21, 23, 24, 26, 27, 29, 33, 34], "codegen": [2, 28, 34], "codeless": 31, "codellama": 28, "codenam": 34, "collabor": 3, "collate_batch": 6, "collate_fn": 6, "collect": [6, 32, 33, 34], "column": 6, "com": [2, 5, 34], "combin": [2, 12, 14, 28, 31, 34], "come": 33, "comma": 33, "command": [4, 5, 6, 14, 23, 31, 32, 33, 34], "comment": [5, 14, 
17, 22, 34], "commit": 5, "common": [17, 21, 28, 31, 33], "commonli": [7, 28, 33, 34], "commun": [6, 28, 31, 32, 33, 34], "communication_backend_nam": 29, "compact": [31, 32, 33], "compar": [1, 2, 7, 13, 18, 21, 26, 28, 30, 31, 33, 34], "compat": [17, 21], "compet": 33, "competit": 33, "compil": [1, 5, 6, 23, 26, 33, 34], "complet": [5, 6, 14, 18, 22, 29, 33], "complex": 17, "complexdoubl": 17, "complexfloat": 17, "complic": [26, 31, 33], "complier": 17, "compon": [15, 26, 27, 28], "compos": [6, 13], "comprehens": [1, 34], "compressor": [3, 7, 16, 22, 34], "compris": 18, "compuat": 13, "comput": [2, 6, 7, 13, 15, 16, 18, 20, 21, 28, 30, 31, 32, 33, 34], "concat": [2, 20, 26, 28, 34], "concat_fp32_from_bf16": 21, "concat_linear": 2, "concat_output": 2, "concaten": [2, 21], "concept": [18, 33], "concern": 7, "conclud": [30, 34], "conclus": 18, "concurr": [32, 33], "conda": [5, 33], "conda_prefix": [31, 32], "condit": 27, "conduct": 7, "conf": [4, 13, 14, 31, 34], "conf_fil": [14, 34], "confer": 3, "config": [2, 6, 11, 23, 31, 32], "configur": [2, 4, 6, 7, 14, 15, 16, 17, 31, 32, 34], "confirm": 31, "conflict": [7, 17], "connect": 33, "consecut": 33, "consider": 16, "consist": [16, 28, 33, 34], "const": [6, 17], "constant": 13, "constraint": [2, 34], "construct": [2, 7, 13], "consum": [7, 14], "consumpt": 34, "contain": [2, 5, 6, 13, 17, 26, 31, 32, 33, 34], "containeraliasingtest": 5, "content": [29, 34], "context": [2, 5, 6, 8, 20, 28, 33, 34], "context_len": 2, "contigu": [6, 13, 18, 33, 34], "contiguous_format": [18, 33], "continu": [31, 32, 34], "contribut": [28, 31, 34], "control": [1, 2, 7, 20, 26, 31, 33, 34], "conv": [2, 8, 10, 13, 15, 20, 26, 34], "conv1d": [8, 13], "conv2": 20, "conv2d": [2, 7, 8, 10, 13, 18, 20, 26, 34], "conv3d": [8, 13, 34], "conv_bn": 2, "conv_bn_fold": [2, 26, 34], "conv_tbc": 8, "conv_transpose1d": 8, "conv_transpose2d": 8, "conv_transpose3d": 8, "conveni": [8, 34], "convers": [2, 8, 13, 34], "convert": [1, 2, 4, 6, 7, 8, 9, 10, 13, 
16, 17, 18, 20, 23, 26, 32, 34], "convert_model": [4, 13, 15, 16], "converted_model": [4, 6, 26, 34], "convolut": [2, 6, 7, 13, 20, 33, 34], "convolution1d": 34, "convolutuon": 2, "convrelu": 13, "convsumrelu": 13, "convtranspose2d": [2, 13], "convtranspose3d": 13, "coo": 18, "cooper": [7, 30, 34], "copi": [5, 17, 18], "copyright": [17, 27], "core": [2, 7, 14, 17, 30, 33, 34], "core_id": [2, 20, 31], "correct": [7, 18, 25, 34], "correspond": [20, 31, 34], "cosine_embedding_loss": 8, "cost": [2, 6, 28, 30, 33], "costli": 33, "could": [7, 13, 16, 18, 26, 32, 33, 34], "count": 31, "counterpart": [2, 7, 18, 34], "coupl": [20, 33, 34], "cout": 6, "cover": [13, 18, 31], "cpp": [5, 6, 33], "cppsdk": 34, "cpu": [1, 3, 4, 5, 6, 7, 8, 10, 14, 15, 16, 19, 20, 23, 25, 26, 28, 30, 31, 32, 34], "cpu_capability_avx512": 17, "cpu_capability_avx512_bf16": 17, "cpu_featur": 17, "cpu_feature_main": 17, "cpu_launcher_arg": 32, "cpu_launcher_en": 32, "cpu_pool": [2, 20, 34], "cpu_pool1": 20, "cpu_pool2": 20, "cpuid": 17, "cpuinfo": 17, "cpunodebind": 33, "cpupool": [2, 20, 34], "crash": [31, 33, 34], "creat": [7, 16, 20, 33, 34], "creation": 2, "creator": 34, "credit": 17, "criteria": 16, "criterion": [6, 8, 16, 22], "cross": [32, 33, 34], "cross_entropy_loss": 8, "crossentropyloss": [6, 16], "csrc": 26, "csv": 14, "ctc_loss": 8, "cu": 5, "cu_seqlens_kv": 2, "cu_seqlens_q": 2, "cudnn": 18, "current": [1, 2, 5, 7, 11, 13, 14, 15, 16, 17, 19, 20, 26, 28, 29, 34], "current_posit": 2, "custom": [1, 2, 7, 26, 34], "customized_forward": 10, "cv": 34, "cvt_fp32_to_bf16": 17, "cvt_fp32_to_bf16_kernel_fn": 17, "cvt_fp32_to_bf16_kernel_impl": 17, "cvt_fp32_to_bf16_kernel_stub": 17, "cvtfp32tobf16": 17, "cvtfp32tobf16krnl": 17, "cxx": [6, 17], "cxx11": 34, "cxx_standard": 6, "d": [4, 5, 6, 7, 8, 13, 26, 28, 34], "d8": 33, "d__avx512f__": 17, "d__avx__": 17, "dag": 13, "daili": 34, "data": [2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 23, 26, 31, 32, 34], "data_typ": 18, "databrick": 
28, "dataload": [2, 6, 10, 13, 16, 20, 22, 29, 34], "dataset": [6, 13, 16, 29, 30, 33, 34], "dataset_nam": [10, 34], "datatyp": [20, 34], "date": 34, "dcmake_prefix_path": 6, "dcpmm": 30, "dcpu_cap": 17, "dcpu_capability_amx": 17, "dcpu_capability_avx2": 17, "dcpu_capability_avx512": 17, "dcpu_capability_avx512_bf16": 17, "dcpu_capability_avx512_fp16": 17, "dcpu_capability_avx512_vnni": 17, "dcpu_capability_default": 17, "ddp": [2, 6], "ddr": 30, "ddr4": 33, "dealloc": 33, "debug": [2, 31], "debug_squad": [10, 34], "dec": 3, "decai": 7, "decid": [2, 15, 20, 28], "decim": 21, "declar": 17, "decltyp": 17, "decod": [2, 28, 30, 34], "deconv3d": 34, "decor": 2, "dedic": [2, 6, 28, 34], "deduct": 31, "deep": [3, 7, 8, 11, 13, 14, 21, 33], "deepcopi": 2, "deepspe": [2, 34], "def": [2, 6, 8, 10, 16, 20, 26, 34], "default": [2, 4, 6, 7, 10, 12, 13, 15, 16, 17, 20, 22, 23, 26, 28, 30, 32, 33, 34], "default_dynamic_qconfig": [15, 32], "default_dynamic_qconfig_map": 6, "default_dynamic_qconfigprepared_model": 4, "default_static_qconfig": [13, 15, 32, 34], "default_static_qconfig_map": 6, "default_static_qconfigprepared_model": 4, "defin": [2, 5, 6, 7, 8, 10, 16, 17, 18, 22, 32], "definit": [17, 21, 34], "degre": 34, "deinit": 5, "deliv": [7, 28, 34], "demand": [2, 7], "demonstr": [6, 18, 26, 32], "demostr": 23, "denomin": 2, "denot": 21, "dens": [7, 18], "dep": 34, "depend": [5, 7, 17, 18, 25, 26, 33, 34], "deploi": 34, "deploy": [2, 7, 13, 34], "deployment_mod": [2, 6, 23], "deprec": [3, 26], "dequant": [13, 16], "desc": 18, "describ": [8, 13, 18, 21, 32, 33], "descript": [4, 7, 16, 18, 20, 25, 33, 34], "descriptor": 34, "design": [2, 5, 8, 18, 21, 29, 34], "desir": [16, 31], "destroy_process_group": 6, "destruct": 33, "detail": [2, 5, 6, 7, 8, 9, 11, 13, 17, 18, 24, 25, 26, 28, 30, 32, 33, 34], "detect": [1, 6, 12, 17, 26, 33, 34], "detectron2": 18, "determin": [2, 6, 17, 21, 33], "develop": [1, 3, 6, 28, 30, 33, 34], "devic": [1, 2, 15, 29, 31, 34], "device_nam": [7, 8], 
"diagram": [18, 33], "dict": [2, 6, 23], "dictionari": 34, "did": [33, 34], "didn": 20, "differ": [1, 2, 7, 15, 16, 17, 18, 20, 28, 31, 32, 33, 34], "difficult": 18, "difficulti": 16, "diffus": [3, 34], "digit": 21, "dim": [2, 6, 18, 23], "dimens": [2, 18, 26], "dimm": 34, "dinner": [6, 23], "dir": [17, 31], "direct": [2, 5, 13], "directli": [2, 6, 33, 34], "directori": [1, 5, 6, 14, 29, 31, 32], "dirty_decay_m": [31, 33], "disabl": [2, 6, 7, 13, 26, 31, 33, 34], "disable_auto_channels_last": 9, "disable_iomp": [14, 32], "disable_numactl": [14, 32], "disadvantag": 21, "discret": 1, "discrete gpu": 1, "discuss": [5, 18, 33], "dispatch": [1, 34], "dist": 6, "dist_sampl": 6, "distilbert": 30, "distribut": [2, 3, 7, 16, 31, 32, 33, 34], "distributeddataparallel": [6, 34], "distributedsampl": 6, "div": 13, "divid": [2, 13, 31, 32, 33, 34], "divis": [2, 20], "divisor": [2, 20], "dl": [3, 7, 34], "dlopen": 20, "dlrm": [3, 7, 26, 30, 34], "dnnl": 30, "dnnl_verbos": 2, "do": [2, 5, 8, 16, 18, 20, 21, 26, 28, 30, 31, 32, 33, 34], "do_ev": [10, 34], "do_sampl": [6, 23], "doc": [1, 2, 5, 11, 29, 34], "doc_strid": [10, 34], "docker": [30, 34], "dockerfil": 34, "dockerhub": 34, "docstr": 5, "document": [0, 7, 17, 20, 29, 34], "doe": [2, 7, 13, 18, 20, 26, 34], "doesn": [2, 15, 16, 18, 26, 34], "dolli": [28, 34], "domin": [1, 7, 28], "don": [2, 5, 8, 14, 17, 34], "done": [6, 10, 16, 17, 26, 33, 34], "dot": [2, 7, 18, 28], "doubl": 17, "down": [5, 32, 34], "download": [6, 13, 16], "downstream": 8, "dpc": 1, "dpcpp": 34, "dram": 2, "dramat": [32, 33], "drawback": [2, 21], "drive": [1, 7, 28], "driven": 2, "drop": [31, 32], "dropout": [2, 10], "dst": 17, "dtype": [2, 4, 6, 7, 8, 10, 11, 13, 15, 16, 17, 23, 26, 29, 31, 34], "due": [1, 8, 10, 17, 20, 26], "dummi": 32, "dummy_tensor": 32, "dummymodul": 10, "dump": [2, 31], "durat": [2, 21], "dure": [4, 6, 7, 10, 13, 16, 21, 31, 33, 34], "dynam": [1, 4, 20, 28, 32, 33, 34], "dynamic_qconfig": 15, "dynamic_quantized_model": 6, "e": [1, 
2, 6, 7, 8, 12, 16, 17, 18, 28, 31, 33, 34], "each": [2, 8, 14, 16, 17, 19, 20, 21, 31, 32, 33, 34], "eager": [1, 7, 11, 12, 23, 32, 34], "earli": [2, 34], "earlier": 21, "eas": [7, 18, 34], "easi": [1, 3, 21], "easier": [2, 18, 21], "easili": [10, 15], "ec2": 32, "edit": [5, 26, 34], "effect": [2, 17, 21, 26, 32, 33], "effici": [1, 7, 11, 19, 20, 28, 31, 33, 34], "effort": 34, "eig": 8, "einsum": 34, "either": [2, 26, 31], "el8_4": 30, "elaps": 33, "element": [2, 18, 19], "eleutherai": [2, 28], "elif": 6, "elimin": 28, "els": [6, 14, 17, 18, 23], "elser": 34, "eltwis": 34, "elu": 13, "emb": 7, "emb1": 7, "emb2": 7, "emb3": 7, "emb_m": 7, "embed": [2, 7, 28, 34], "embedding_bag": 10, "embedding_spec": 7, "embeddingbad": 34, "embeddingbag": [7, 26, 34], "embeddingspec": 7, "embedingbag": 7, "emblist": 7, "emerg": [1, 7, 28], "emphas": 33, "emply_lik": 2, "empow": 3, "empti": [18, 31], "enabl": [1, 2, 3, 4, 6, 7, 8, 10, 13, 16, 18, 20, 22, 23, 26, 28, 31, 32, 33, 34], "enable_auto_channels_last": 9, "enable_auto_mix_precis": 34, "enable_auto_mixed_precis": 34, "enable_auto_optim": 34, "enable_blockwise_loss": [16, 22], "enable_jemalloc": 32, "enable_onednn_fus": [2, 13], "enable_tcmalloc": 32, "encod": 34, "encount": [26, 34], "encourag": 34, "end": [6, 13, 20, 34], "endif": 17, "endl": 6, "engin": [1, 6, 18, 33], "enhanc": [1, 3, 28, 34], "enough": [2, 7, 19], "ensur": [11, 19, 20, 32, 34], "entir": [2, 16, 28], "enumer": [6, 13, 16, 29], "env": [6, 29], "env_key1": 5, "env_key2": 5, "env_val1": 5, "env_val2": 5, "environ": [2, 5, 6, 17, 20, 24, 28, 30, 31, 32, 33], "ep": [2, 7, 10, 19], "epoch": 16, "equal": [2, 15, 20, 31, 32, 33], "equip": 33, "equival": 34, "error": [2, 5, 6, 7, 10, 16, 18, 21, 22, 26, 34], "especi": [2, 5, 28, 34], "etc": [2, 5, 6, 17, 34], "eval": [2, 4, 6, 8, 10, 11, 12, 13, 15, 16, 20, 23, 26, 29, 32, 34], "eval_func": [2, 16, 34], "eval_funct": 4, "evalu": [2, 16, 34], "even": [2, 5, 7, 33, 34], "evenli": 31, "everi": [2, 28], "exact": 2, 
"exactli": 21, "exampl": [2, 5, 7, 8, 13, 18, 19, 21, 22, 23, 24, 25, 28, 29, 32, 33, 34], "example_input": [2, 4, 6, 13, 15, 29, 32, 34], "example_kwarg_input": 2, "examplenet": 20, "examplenet1": 20, "examplenet2": 20, "exce": [26, 30, 33, 34], "except": [28, 31, 34], "excess": 34, "excit": 34, "exclus": 31, "execut": [2, 4, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 19, 20, 26, 31, 32, 33, 34], "exetens": 2, "exhibit": 30, "exist": [1, 5, 7, 13, 26, 31, 33], "exit": [6, 31], "exp": 13, "expect": [2, 7, 30, 34], "expecttest": 5, "expens": 18, "experi": [5, 7, 10, 12, 16, 18, 26, 33, 34], "experiment": 34, "explain": [17, 18, 21], "explicit": [18, 20, 33], "explicitli": [2, 8, 16, 20, 26, 31, 34], "explor": 2, "expon": 21, "export": [4, 31, 33], "expos": 8, "express": [18, 34], "ext": [6, 34], "extend": [1, 18, 25, 33, 34], "extens": [2, 3, 4, 6, 9, 10, 13, 14, 16, 17, 23, 24, 25, 27, 28, 29, 30, 31, 33, 34], "extra": [2, 5, 10, 20, 31, 32], "extra_rope_config": 2, "extrem": [7, 14, 33], "f": [5, 6, 13, 16, 28, 34], "f1": 30, "f16c": 17, "f32": [17, 18], "f401": [6, 11, 12, 13, 16, 23, 29], "face": 3, "facebook": [3, 6, 28], "facilit": 34, "fact": [18, 33], "factor": [2, 6, 16, 31], "fail": [10, 26, 34], "failur": [12, 34], "fake": 2, "fake_quantize_per_tensor_affin": 8, "falcon": [2, 28, 34], "fall": [6, 12], "fals": [2, 4, 6, 7, 8, 13, 14, 15, 16, 17, 20, 22, 23, 26, 31, 32, 34], "famili": [2, 28, 33, 34], "fashionmnist": 16, "fast": [4, 12, 33, 34], "fast_bert": [2, 4, 6, 7, 11, 34], "fast_layer_norm": [2, 34], "faster": [2, 6, 7, 8, 30, 33], "fastest": 17, "fastlayernorm": [2, 34], "fatal_error": 6, "fault": 34, "favorit": 31, "fb": 34, "feasibl": 10, "featur": [0, 1, 2, 3, 5, 8, 10, 13, 14, 18, 20, 23, 25, 26, 28, 30, 31, 32, 33, 34], "feb": 3, "feed": [2, 9, 18], "feedback": 34, "feedforward": 28, "feel": [5, 18, 34], "few": [5, 7, 9, 13, 16, 18, 32, 34], "fewer": 21, "fft_fft": 8, "fft_fft2": 8, "fft_fftn": 8, "fft_hfft": 8, "fft_ifft": 8, "fft_ifft2": 8, 
"fft_ifftn": 8, "fft_ihfft": 8, "fft_irfft": 8, "fft_irfft2": 8, "fft_irfftn": 8, "fft_rfft": 8, "fft_rfft2": 8, "fft_rfftn": 8, "figur": [1, 2, 21, 28, 33], "file": [2, 4, 5, 6, 8, 14, 15, 16, 17, 18, 31, 34], "filenam": 5, "find": [1, 2, 7, 14, 16, 23, 26, 30, 31, 34], "find_packag": 6, "findavx": 17, "fine": [3, 20, 29, 31, 32, 33, 34], "finer": [1, 7, 20], "finish": [6, 11, 12, 13, 16, 20], "first": [2, 3, 5, 6, 7, 9, 10, 12, 16, 19, 20, 21, 26, 31, 32, 33], "firstli": [2, 28], "fit": [5, 7, 33, 34], "fix": [2, 5, 7, 34], "flag": [2, 5, 7, 17, 20, 31, 34], "flake8": 5, "flan": 28, "flash": 34, "flash_atten_varlen": 2, "flatten": [16, 20], "flexibl": 34, "float": [2, 6, 7, 8, 14, 15, 16, 17, 21, 29, 34], "float16": [2, 8], "float32": [2, 13, 21, 23, 26, 30, 31, 34], "float64": 8, "flourish": 28, "flow": 26, "flush": [6, 23], "fma": 17, "fn_type": 17, "focu": [2, 10, 18, 29, 34], "focus": [13, 34], "fold": [2, 10, 15, 16, 26, 34], "folder": 5, "follow": [1, 2, 4, 5, 6, 7, 8, 11, 14, 15, 16, 17, 18, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 34], "footbal": 7, "footprint": [7, 21, 28, 34], "forg": 33, "fork": [17, 33], "format": [2, 5, 6, 7, 9, 14, 22, 26, 28, 31, 33, 34], "format_tag": 18, "former": 6, "formerli": [30, 33, 34], "formula": 21, "forward": [2, 6, 8, 13, 16, 20, 21, 26, 32, 33, 34], "found": [1, 6, 7, 14, 16, 18, 29, 31, 32, 33, 34], "foundat": [18, 33], "fp16": [2, 6, 17, 29], "fp32": [2, 4, 16, 17, 19, 21, 23, 28, 34], "fp32_gw": 21, "fp32_w": 21, "fpn": 30, "fraction": 21, "fractional_max_pool2d": 8, "fractional_max_pool3d": 8, "fragment": 33, "framework": [5, 34], "free": [31, 34], "freez": [6, 8, 10, 13, 15, 16, 20, 23, 26, 32, 34], "freezed_model": [26, 34], "frequenc": [2, 30], "frequent": 7, "friendli": [7, 33], "from": [1, 2, 3, 4, 5, 8, 10, 11, 13, 15, 16, 17, 18, 19, 20, 21, 23, 25, 28, 29, 31, 32, 33, 34], "from_embeddingbag_list": 7, "from_pretrain": [4, 6, 11, 23, 29, 32], "front": [13, 34], "frontend": [1, 2, 7, 20, 28, 34], 
"frozenbatchnorm": 34, "frozenbatchnorm2d": 7, "fsi": 34, "fulfil": 20, "full": [2, 5, 18, 32, 33, 34], "fulli": [5, 15, 17, 21, 31, 33, 34], "function": [2, 5, 6, 7, 8, 10, 11, 12, 14, 15, 17, 20, 21, 23, 26, 28, 29, 31, 33, 34], "further": [1, 2, 5, 6, 7, 18, 20, 28, 33, 34], "fuse": [2, 7, 13, 16, 19, 28, 34], "fuse_update_step": 2, "fusion": [1, 2, 7, 10, 21, 28, 34], "futur": [7, 28, 34], "futuretensor": 20, "fx": [3, 7, 10, 26, 34], "g": [2, 7, 8, 16, 17, 18, 28, 34], "gain": [1, 7, 26, 28, 34], "game": 7, "gave": 14, "gb": 20, "gcc": 17, "gcp": 3, "gelu": [2, 13, 34], "gemm": [7, 18, 26, 28, 34], "gen": [3, 30, 34], "gen_": 2, "gen_id": [6, 23], "gen_text": [6, 23], "genai": [1, 7, 28], "gender": 7, "gener": [1, 5, 6, 7, 10, 12, 16, 17, 18, 21, 23, 28, 29, 30, 31, 32, 33, 34], "generate_kwarg": [6, 23], "genv": 31, "geomean": 34, "geqrf": 8, "get": [1, 2, 3, 4, 6, 7, 10, 11, 15, 17, 20, 21, 22, 26, 28, 29, 30, 31, 33, 34], "get_acceler": 29, "get_core_list_of_node_id": 2, "get_cpp_typesize_and_vecs": 17, "get_cpp_typesize_and_vecsize_kernel_fn": 17, "get_cpp_typesize_and_vecsize_kernel_impl": 17, "get_cpp_typesize_and_vecsize_kernel_stub": 17, "get_smooth_quant_qconfig_map": [2, 6, 29], "get_weight_only_quant_qconfig_map": [2, 6, 29], "getattr": [6, 23], "getveclength": 17, "getveclengthkrnl": 17, "gif": 31, "gil": 20, "git": [2, 5, 28], "github": [1, 2, 5, 6, 7, 8, 34], "give": [32, 34], "given": [2, 6, 13, 14, 16, 28], "global": [2, 20, 22, 34], "global_past_key_valu": 6, "gnu": [6, 17, 32], "go": [2, 5, 8], "gomp_cpu_affin": 33, "good": [1, 2, 5, 7, 12, 18, 19, 28, 33, 34], "googl": [3, 5, 28], "gperftool": 33, "gpertool": 33, "gpt": [2, 28, 30], "gpt2": 26, "gptbigcod": [2, 28], "gptj": 2, "gptjforcausallm": 2, "gptq": [2, 6, 34], "gpu": [1, 3, 18, 34], "grad": [7, 19], "grad0": 19, "grad1": 19, "grad_i": 19, "grad_n": 19, "gradient": 7, "grain": [1, 3, 7, 20], "granular": [2, 31, 32, 33], "graph": [1, 4, 8, 10, 16, 23, 26, 31, 34], "graph_for": 13, 
"graph_mod": [2, 4, 7, 12, 34], "graphic": 33, "great": 33, "greater": 2, "greedi": [6, 23], "grid": 14, "grid_sampl": 8, "grokk": 3, "ground": 21, "group": [2, 19, 20, 33], "group_norm": 8, "group_siz": 2, "gru": 15, "grucel": 15, "gt": [4, 14, 33], "gtest_filt": 5, "guid": [3, 6, 7, 17, 32, 34], "guidanc": 7, "guidelin": 18, "gw": 21, "h": [5, 6, 7, 16, 18, 26, 31, 32], "ha": [0, 1, 2, 7, 10, 14, 17, 18, 20, 21, 26, 28, 30, 31, 33, 34], "had": [6, 33], "half": [2, 7, 17, 21], "halv": 21, "handl": [6, 18, 33], "handler": 32, "hang": [33, 34], "happen": 7, "hard": [18, 26], "hardsigmoid": 34, "hardswish": [13, 34], "hardtanh": 13, "hardwar": [1, 3, 17, 25, 28, 32, 34], "hav": 17, "have": [1, 2, 5, 6, 7, 9, 14, 17, 18, 20, 21, 23, 26, 27, 28, 30, 31, 32, 33, 34], "head": [2, 34], "head_dim": 2, "head_map": 2, "head_mask": 2, "head_num": 2, "head_siz": 2, "header": 17, "heavi": 7, "heavier": 28, "height": 18, "hello": 5, "help": [2, 5, 6, 17, 23, 28, 31, 33, 34], "helper": 2, "here": [5, 8, 10, 13, 16, 17, 18, 20, 26, 32, 33, 34], "herebi": 16, "hero": 34, "heterogen": 34, "heurist": [2, 20, 34], "hf": [6, 28], "hf_beam_sampl": 34, "hf_beam_search": 34, "hf_greedy_search": 34, "hf_sampl": 34, "hidden": [2, 18, 28], "hidden_s": [2, 6], "hidden_st": 2, "high": [19, 21, 33], "higher": [2, 7, 13, 17, 18, 28], "higher_is_bett": 14, "highli": [7, 23, 28, 33, 34], "hinge_embedding_loss": 8, "hint": [2, 20], "histogram": [30, 34], "histogramobserv": [2, 15], "histori": [2, 14, 28], "hobbi": 7, "hold": [18, 33], "home": [31, 32], "homebrew": 5, "hood": 34, "hook": [10, 16], "hopefulli": 7, "host": [30, 34], "hostfil": 31, "hostnam": 31, "hotspot": 28, "how": [1, 2, 10, 15, 17, 18, 23, 28, 31, 32, 33, 34], "howev": [2, 5, 7, 8, 9, 16, 20, 26, 28, 31, 33, 34], "hp": 14, "hpc": 11, "html": [2, 5, 16], "http": [2, 5, 16, 34], "hub": 28, "huber_loss": 8, "hug": 3, "huge": [7, 14, 33], "hugginfac": 34, "huggingfac": [2, 6, 26, 28, 32, 34], "huggingface_transform": 32, "hurt": 20, 
"hw": 18, "hwc": 18, "hwio": 18, "hwn": 18, "hydra": 31, "hyper": [2, 30, 33, 34], "hyperparam": 14, "hyperparamet": [4, 7], "hyperparamt": 14, "hyperthread": 32, "hypertun": [4, 34], "hypertune_directori": 14, "hypervisor": 34, "hypothesi": 5, "i": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 21, 22, 23, 26, 27, 28, 29, 30, 32, 33, 34], "i_mpi_pin_domain": 31, "iakv": [2, 28], "ic": 2, "ic_block": 2, "id": [2, 31, 32], "idea": [11, 21, 33], "ideep": [17, 18], "ident": [2, 10, 18], "identif": [6, 17], "identifi": 34, "idx": [2, 28, 31], "ieityuan": 28, "illeg": 34, "illustr": [18, 19, 21, 31, 33], "imag": [8, 13, 18, 33, 34], "image_classifi": 32, "imagenet": [18, 30], "immedi": 7, "immintrin": 17, "impact": [2, 7, 20], "imper": [20, 34], "impl": 17, "implement": [1, 5, 7, 11, 19, 26, 28, 33, 34], "implicit": 18, "implicitli": 6, "import": [1, 2, 4, 5, 6, 7, 10, 11, 12, 13, 15, 16, 17, 18, 20, 21, 23, 25, 26, 28, 29, 32, 33, 34], "impract": [7, 14], "improv": [1, 3, 7, 8, 13, 20, 22, 28, 30, 32, 33], "in1": 7, "in2": 7, "in3": 7, "in_i": 7, "in_m": 7, "inaccur": 21, "inc": [16, 17, 22, 28], "includ": [1, 2, 5, 6, 7, 10, 14, 15, 17, 23, 26, 27, 28, 30, 34], "inclus": 33, "incorrect": [12, 26, 34], "increas": [1, 2, 3, 21, 26, 28, 30, 33, 34], "independ": 31, "index": [2, 5, 18, 28, 33], "index_copi": 8, "index_to_nam": 32, "indic": [2, 6, 18, 28], "indirect": 2, "indirect_access_kv_cache_attent": [2, 34], "indirectaccesskvcacheattent": [2, 34], "individu": [5, 30], "inductor": [7, 34], "inevit": 10, "inf": 14, "infer": [2, 3, 4, 7, 10, 11, 12, 15, 18, 20, 21, 23, 26, 30, 33, 34], "inferenc": 2, "inference2": 30, "inference3": 30, "inference_mod": [6, 23, 29], "influenc": [31, 33], "info": [2, 6, 17, 26, 31, 32, 34], "inform": [1, 2, 6, 7, 14, 17, 18, 28, 31, 32, 33, 34], "ingredi": 18, "init": [2, 5, 15, 34], "init_alpha": [16, 22], "init_distribut": 29, "init_infer": 29, "init_method": 6, "init_process_group": 6, "initi": [2, 20, 32], "inject": 34, 
"inlin": 17, "inplac": [2, 4, 6, 13, 15, 18, 23, 32], "input": [2, 6, 7, 9, 10, 13, 15, 16, 17, 18, 22, 23, 26, 29, 30, 32, 33, 34], "input1": 10, "input_channel": 2, "input_hint": 20, "input_id": [6, 23], "input_ids_pad": 6, "input_s": [6, 23], "input_split_hint": [2, 20], "input_tokens_length": [6, 23], "inputpath": 32, "insert": [2, 16], "insid": [2, 5, 20, 31], "inspir": 34, "instal": [4, 5, 6, 23, 25, 26, 28, 33, 34], "instanc": [2, 7, 10, 14, 32, 34], "instance_idx": 31, "instancenorm": 34, "instanti": 6, "instead": [7, 8, 14, 19, 20, 29, 30, 31, 32, 33, 34], "instruct": [1, 2, 5, 6, 7, 8, 17, 21, 23, 24, 25, 28, 30, 33, 34], "int": [2, 6, 7, 14, 17, 23, 26, 29, 31, 34], "int4": [2, 28, 29, 34], "int8": [1, 2, 3, 4, 17, 18, 20, 22, 28, 29, 34], "int8_qconfig": 6, "integ": [28, 31, 33], "integr": [7, 18, 28, 33, 34], "intel": [2, 3, 4, 7, 8, 9, 10, 11, 13, 14, 16, 17, 20, 21, 22, 23, 25, 26, 27, 28, 29, 34], "intel discrete gpu": 1, "intel optim": 1, "intel_extension_for_pytorch": [1, 2, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17, 20, 23, 25, 29, 32, 34], "intel_pytorch_extens": [7, 25, 26, 34], "intel\u00ae extension for pytorch*": 1, "intend": 5, "intent": 5, "interact": [7, 34], "interconnect": 33, "interest": 5, "interfac": [5, 6, 18, 26, 28], "intern": [17, 18, 20, 32], "interpret": 31, "interrupt": 32, "intervent": 8, "intra": 2, "intrins": 17, "introduc": [1, 3, 7, 15, 18, 21, 22, 31, 33, 34], "introduct": [0, 2, 7, 28, 33, 34], "invalid": 33, "invers": 8, "investig": [2, 31], "invoc": [1, 7], "invok": [2, 6, 8, 10, 13, 20, 23, 26, 29, 34], "involv": 21, "io": 28, "iostream": 6, "ip": 31, "ipex": [1, 2, 3, 4, 6, 7, 9, 11, 12, 13, 15, 16, 17, 19, 20, 23, 26, 29, 31, 32, 34], "ipex_declare_dispatch": 17, "ipex_define_dispatch": 17, "ipex_en": 32, "ipex_fus": 2, "ipex_register_dispatch": 17, "ipexconfig": 6, "ipexrun": [4, 10, 31, 34], "is_caus": 2, "is_contigu": 18, "is_cus": 2, "is_dynam": [6, 15], "is_hyperthreading_en": 14, "is_runtime_ext_en": 2, 
"isa": [1, 34], "isa_codegen": 17, "isa_nam": 17, "isacodegen": 17, "issu": [1, 2, 5, 8, 21, 26, 33], "ital": 32, "item": 16, "iter": [2, 16, 21, 28, 34], "its": [2, 6, 7, 8, 14, 17, 21, 28, 30, 31, 32, 33, 34], "itself": [2, 5, 18], "ivalu": 6, "j": [2, 5, 17, 28, 30], "jan": 3, "je": 14, "jemalloc": [30, 32, 34], "jemallocl": 31, "jit": [1, 2, 5, 6, 7, 8, 13, 15, 16, 18, 20, 23, 26, 32, 34], "job": 5, "join": 33, "joint": 34, "joint_net": [26, 34], "json": [2, 6, 15, 16, 32, 34], "jul": 3, "jun": 3, "jupyt": 5, "just": [2, 14, 29, 34], "k": [2, 5], "kcpu": 17, "keep": [5, 12, 18, 21, 28, 32, 33, 34], "kei": [2, 7, 28, 34], "kept": 21, "kernel": [1, 2, 7, 20, 26, 28, 30, 33, 34], "kernel_s": 10, "key_cach": 2, "key_token": 2, "keystrok": 5, "keytensor": 2, "keyword": 2, "kill": 32, "kind": 7, "kineto_librari": 6, "kl_div": 8, "kmp": [31, 33], "kmp_": 20, "kmp_affin": [31, 32, 33], "kmp_blocktim": [31, 32, 33], "knob": [2, 4, 12, 31], "know": 5, "knowledg": 33, "known": [6, 10, 28], "kt": 3, "kv": 2, "kv_cach": [2, 28], "kwarg": [2, 29], "l1318": 2, "l1_loss": 8, "l2": 33, "l23": 2, "l4": 2, "l50": 2, "l76": 2, "label": 8, "lake": [7, 30, 34], "lamb": [19, 21], "land": [7, 34], "landscap": [1, 7, 28], "languag": [1, 2, 23, 24, 25, 26, 29, 34], "lar": 34, "larg": [1, 2, 19, 23, 24, 25, 26, 29, 30, 33, 34], "larger": [2, 20, 30, 31, 33, 34], "last": [3, 10, 21, 26, 34], "last_ind": 6, "latenc": [3, 14, 18, 28, 30, 32, 34], "later": [2, 7, 25, 33], "latest": [1, 2, 25, 28, 30, 34], "launch": [4, 6, 20, 32, 34], "launcher": [7, 13, 31, 33, 34], "law": 7, "layer": [2, 16, 20, 22, 28, 34], "layer_past": 2, "layernorm": [2, 13, 16, 22, 34], "layernorm_modul": 2, "layout": [2, 26, 34], "lazi": 5, "ld": 31, "ld_preload": [20, 31, 32, 33], "ldd": 6, "lead": 28, "leaki": 13, "leaky_relu": 13, "leakyrelu": 34, "learn": [3, 7, 8, 11, 13, 14, 21, 31, 33], "learning_r": [10, 34], "leav": [2, 20, 33], "left": [21, 28, 32], "legal": 34, "legend": 28, "len": [2, 6, 7, 13, 16, 17], 
"length": [2, 5, 14, 21, 26, 30, 34], "less": [2, 8, 18, 20, 26, 34], "let": [5, 10, 18, 19, 20, 21], "level": [7, 10, 13, 16, 18, 20, 21, 26, 33, 34], "leverag": [1, 7, 11, 28, 32, 34], "lib": [6, 31, 32], "lib64": [31, 32], "libc10": 6, "libdnnl_graph": 6, "libgomp": 33, "libintel": [6, 34], "libiomp": 33, "libiomp5": [20, 31, 32, 33], "libjemalloc": 31, "libpytorch_path": 6, "librari": [1, 2, 5, 6, 7, 17, 20, 32, 33, 34], "libtcmalloc": [31, 32], "libtorch": [6, 34], "libtorch_cpu": 6, "libxsmm": 2, "licens": 17, "lighter": 8, "lightweight": 34, "like": [1, 2, 3, 5, 6, 7, 8, 14, 18, 19, 21, 26, 28, 31, 33, 34], "limit": [5, 8, 10, 20, 26, 32, 33, 34], "linalg_choleski": 8, "linalg_cholesky_ex": 8, "linalg_cond": 8, "linalg_eig": 8, "linalg_eigh": 8, "linalg_eigv": 8, "linalg_eigvalsh": 8, "linalg_householder_product": 8, "linalg_inv": 8, "linalg_inv_ex": 8, "linalg_lstsq": 8, "linalg_matrix_rank": 8, "linalg_qr": 8, "linalg_solv": 8, "linalg_svd": 8, "linalg_svdv": 8, "linalg_tensorinv": 8, "linalg_tensorsolv": 8, "line": [5, 10, 13, 18, 31, 32, 33], "linear": [2, 6, 7, 8, 13, 15, 16, 18, 26, 33, 34], "linear2silumul": [2, 34], "linear_": 2, "linear_bn": 2, "linear_bn_fold": 2, "linear_m": 2, "linear_m_modul": 2, "linear_modul": 2, "linear_relu_stack": 16, "linear_s_modul": 2, "linearadd": [2, 34], "linearaddadd": [2, 34], "lineargelu": [2, 34], "linearize_indices_and_offset": 7, "linearmul": [2, 34], "linearnewgelu": [2, 34], "linearrelu": [2, 34], "linearsilu": [2, 34], "linearsilumul": [2, 34], "link": [1, 6, 17, 34], "linux": [5, 6, 17, 30, 31, 33], "list": [2, 5, 7, 8, 13, 14, 16, 18, 25, 29, 31, 32, 33, 34], "liuhaotian": 28, "live": 5, "ll": [5, 32, 33], "llama": [2, 3, 6, 28, 34], "llama2": [30, 34], "llama3": 34, "llava": [2, 28], "llm": [1, 16, 22, 24, 25, 34], "load": [1, 2, 6, 7, 13, 15, 16, 17, 23, 29, 32, 34], "load_dataset": 6, "load_qconf_summari": 15, "load_state_dict": [2, 34], "loader": 16, "local": [6, 20, 28, 31, 32, 33], "locat": [5, 17, 
34], "log": [4, 6, 13, 31, 32, 34], "logic": [2, 14, 18, 32, 33], "login": 6, "logit": 16, "long": [2, 6, 18, 21, 26, 28, 34], "long_factor": 2, "longer": [26, 30, 34], "longform": 26, "look": [5, 14, 16, 18], "loop": [5, 21, 29], "lose": 21, "loss": [2, 5, 6, 8, 16, 18, 21, 26], "loss_fn": 16, "lot": [28, 34], "low": [3, 4, 6, 7, 21, 23, 31, 33, 34], "low_cpu_mem_usag": [6, 23], "low_precision_checkpoint": [2, 6, 29], "lower": [2, 8, 17, 21, 28, 34], "lowest": 2, "lowp": [2, 6], "lowp_mod": [2, 6, 29], "lr": [6, 7, 8, 16, 19], "lr_decai": 19, "lsb": 17, "lscpu": 33, "lstm": [2, 10, 15, 34], "lstmcell": 15, "lstsq": 8, "lt": [4, 30], "lu_solv": 8, "m": [4, 14, 20, 26, 31, 32, 33, 34], "m6i": [30, 32], "m7i": 30, "machin": [3, 5, 6, 7, 14, 17, 26, 31, 32, 33, 34], "maco": 5, "macro": 17, "made": [5, 34], "mai": [1, 2, 3, 5, 6, 7, 8, 9, 16, 17, 18, 20, 26, 31, 32, 33, 34], "main": [1, 2, 5, 6, 14, 20, 31, 32], "mainli": [31, 34], "maintain": 8, "major": 16, "make": [2, 5, 6, 7, 14, 15, 17, 21, 23, 28, 32, 33], "make_tupl": 17, "makefil": 5, "malloc": [14, 31, 33], "malloc_conf": [31, 33], "mamx": 17, "man": [7, 33], "manag": [2, 8, 13, 20, 28, 31], "mandatori": 14, "mani": [5, 14, 28, 31, 33, 34], "manipul": 18, "mantissa": 21, "manual": [2, 7, 10, 14, 18, 20, 34], "manual_se": [6, 11], "map": [2, 6, 18, 30], "mar": [3, 32], "margin_ranking_loss": 8, "mask": [2, 7, 17, 26], "mask_valu": 17, "maskrcnn": [33, 34], "maskrnn": 34, "master": [2, 7, 21, 31], "master_addr": 6, "master_port": 6, "match": [2, 8, 17, 31], "math": 7, "matmul": [2, 8, 13, 26, 34], "matrix": [1, 6, 7, 25, 28], "matrix_rank": 8, "matur": 34, "mavx2": 17, "mavx512bf16": 17, "mavx512bw": 17, "mavx512dq": 17, "mavx512f": 17, "mavx512fp16": 17, "mavx512vl": 17, "mavx512vnni": 17, "max": [2, 6, 16, 17, 22, 23, 26, 34], "max_context_len": 2, "max_new_token": [6, 23], "max_num_blocks_per_seq": 2, "max_position_embed": 2, "max_seq": 2, "max_seq_len": 30, "max_seq_length": [10, 34], "max_seqlen_k": 2, 
"max_seqlen_kv": 2, "max_seqlen_q": 2, "max_trial": 14, "max_unpool2d": 8, "max_unpool3d": 8, "maxim": 14, "maximum": [2, 16, 17], "maxpool": 34, "maxpool2d": 13, "maycontainalia": 5, "md": 18, "me": 18, "mean": [2, 16, 17, 18, 20, 22, 28, 34], "meant": 34, "meanwhil": [12, 33, 34], "measur": [30, 34], "mechan": [1, 7, 17, 21, 34], "medium": [28, 34], "meet": [21, 33, 34], "meltdown": 30, "membind": 33, "memori": [2, 6, 7, 8, 9, 10, 13, 19, 20, 21, 26, 28, 30, 32, 34], "memory_format": [6, 7, 18, 23], "mention": [3, 10, 20, 21, 34], "merg": [0, 7, 34], "merged_emb": 7, "merged_input": 7, "mergedembeddingbag": 7, "mergedembeddingbagwith": 7, "mergedembeddingbagwithsgd": 7, "merit": 18, "mermori": 2, "messag": [2, 6, 10, 12, 18, 31], "meta": [6, 18, 28, 29, 34], "metadata_thp": [31, 33], "method": [2, 8, 15, 16, 18, 22, 26, 33, 34], "method1": 10, "method2": 10, "methodologi": [2, 6, 7, 19, 33], "methond": 15, "metric": [2, 16, 30], "mfma": 17, "mha": [2, 34], "mhz": 33, "microarchitectur": [33, 34], "microsoft": [2, 28], "might": [2, 7, 18, 26, 33, 34], "migrat": 7, "millisecond": 33, "min": [2, 16, 22, 26, 34], "mind": [18, 32], "mini": [2, 20, 28, 34], "minim": [7, 14, 17, 33], "minimum": [14, 16, 18], "minmax": 34, "minmaxobserv": [2, 6, 15], "misc": 34, "mish": 13, "miss": 5, "mistral": [2, 28, 34], "mistralai": 28, "mitig": [20, 30], "mix": [2, 6, 13, 23, 26, 28, 34], "mixed_dtyp": 34, "mixtral": [2, 28], "mixtur": [8, 34], "mkdir": 6, "mkl": 34, "mkldnn": 18, "mkldnn_util": 18, "mllama": 2, "mlp": 34, "mm": 8, "mmuzzy_decay_m": 33, "mmx": 17, "mno": 17, "mobilenet": 30, "mode": [1, 2, 5, 7, 10, 12, 18, 20, 23, 26, 32, 34], "model": [1, 2, 3, 4, 8, 9, 10, 11, 12, 14, 16, 23, 24, 25, 26, 29, 30, 33, 34], "model1": 20, "model2": 20, "model_execut": 34, "model_id": [6, 23], "model_log": 32, "model_name_or_path": [10, 29, 34], "model_script": 20, "model_service_work": 32, "model_state_dict": 6, "model_stor": 32, "model_to_be_calibr": 34, "modelfamili": 28, 
"modeling_llama": 2, "modelurl": 32, "modern": 3, "modifi": [2, 5, 6], "modul": [1, 6, 7, 8, 13, 16, 17, 26, 29, 31, 34], "modular": 2, "modulist": 7, "momentum": [6, 10, 21], "monkei": 10, "more": [1, 2, 5, 6, 7, 8, 10, 11, 13, 16, 17, 19, 20, 21, 23, 26, 28, 32, 33, 34], "moreov": [1, 2, 28], "mosaicml": 28, "most": [2, 6, 7, 13, 21, 28, 30, 32, 33, 34], "motherboard": 33, "motiv": [2, 20], "move": [18, 33], "movingaverageminmax": 34, "mp_size": 29, "mpi": 31, "mpiexec": 31, "mpt": [2, 28, 34], "mrdimm": 34, "mrpc": 30, "mse_loss": 8, "much": [15, 18, 21, 28, 31, 33], "mul": [2, 13, 16], "multi": [2, 7, 14, 20, 28, 31, 33, 34], "multi_margin_loss": 8, "multi_stream": 2, "multi_stream_input_hint": 34, "multi_stream_model": [20, 34], "multi_stream_output_hint": 34, "multidimension": 18, "multiheadattent": 28, "multilabel_margin_loss": 8, "multilabel_margin_loss_forward": 8, "multipl": [2, 5, 7, 8, 16, 17, 18, 26, 28, 30, 32, 33, 34], "multiplex": 34, "multipli": 2, "multistreammodul": [2, 7, 20, 26, 34], "multistreammodulehint": [2, 20, 34], "multithread": 33, "must": [2, 5, 14, 17, 19], "mutual": 31, "muzzy_decay_m": [31, 33], "my": 18, "mykernel": 17, "mymodel": 34, "mypi": 5, "n": [2, 6, 7, 16, 18, 19, 20, 26, 32, 33, 34], "n1": 18, "n2": 18, "n_iter": 32, "name": [2, 5, 7, 14, 17, 25, 28, 31, 32, 33, 34], "namespac": [8, 17], "nan": [17, 34], "nanquantil": 8, "narg": 6, "narrow": 5, "nativ": [1, 6, 7, 8, 17, 19, 21, 26, 28, 34], "natur": [18, 21, 28], "naver": 3, "nb": 18, "nc": 32, "nchw": [7, 33], "ncore": [10, 31], "ncore_per_inst": [14, 34], "ncores_per_inst": 14, "nd": 18, "necessari": 18, "necessarili": 2, "neck": 19, "need": [2, 5, 6, 7, 10, 13, 14, 16, 17, 18, 19, 20, 21, 23, 26, 29, 31, 32, 33, 34], "need_linearize_indices_and_offset": 7, "neelnanda": 6, "neg": 21, "neglig": 18, "neighbor": 2, "neox": [2, 28], "net": 34, "network": [1, 3, 7, 8, 20, 25, 28, 33], "neural": [1, 3, 7, 16, 22, 25, 28, 33, 34], "neuralnetwork": 16, "new": [3, 5, 12, 16, 17, 
18, 20, 23, 26, 29, 33], "new_gelu": 2, "new_layer_past": 2, "newer": [1, 28, 33], "newgeluactiv": 2, "newkernel": 17, "newkernelkrnl": 17, "newli": 34, "newlin": 5, "next": [5, 7, 34], "nf4": [2, 29], "nhwc": [7, 33, 34], "nifti": 33, "ninstanc": [10, 14, 31, 34], "nint": 5, "nll_loss": 8, "nll_loss2d": 8, "nlp": [6, 7, 26, 30, 34], "nm": [7, 34], "nn": [2, 6, 7, 8, 10, 13, 15, 16, 18, 20, 26, 34], "nnc": 26, "nnode": 31, "no_grad": [4, 6, 10, 11, 12, 13, 15, 16, 20, 23, 26, 29, 32, 34], "node": [2, 20, 30, 32, 33, 34], "node0": 33, "node1": 33, "node_id": [2, 20, 31, 32, 34], "non": [2, 5, 8, 13, 18, 30, 32, 34], "noncontigu": 18, "none": [2, 6, 29, 31], "noqa": [6, 11, 12, 13, 16, 23, 29], "normal": [1, 2, 6, 7, 13, 20, 28, 33, 34], "normalized_shap": 2, "note": [2, 3, 5, 6, 15, 16, 17, 18, 20, 22, 24, 28, 30, 31, 32, 33], "notfound": 6, "noth": 2, "notic": [27, 31, 32], "nov": 3, "now": [2, 7, 15, 18, 32, 33, 34], "np": [16, 31], "nproc": 31, "nth": [32, 33], "num": [2, 20, 32, 33, 34], "num_attention_head": 6, "num_beam": [6, 23], "num_block": 2, "num_featur": 7, "num_head": 2, "num_hidden_lay": 6, "num_kv_head": 2, "num_nod": 14, "num_seq": 2, "num_stream": [2, 20, 34], "num_token": 2, "num_train_epoch": [10, 34], "numa": [2, 20, 31, 32, 34], "numactl": [20, 31, 32], "number": [1, 2, 5, 6, 7, 14, 16, 19, 20, 21, 26, 32, 34], "numer": [2, 8, 33], "numpi": 16, "o": [6, 17, 23, 30, 34], "o0": [2, 26, 34], "o1": [2, 26, 34], "o3": 17, "object": [2, 6, 7, 14, 17, 20, 33, 34], "observ": [2, 9, 13, 15, 34], "obsev": 15, "obtain": 16, "obviou": 28, "occupi": 26, "occur": 34, "occurr": 28, "off": [7, 8, 21, 28, 30, 34], "offer": [1, 5, 33], "offici": [5, 32, 33, 34], "offlin": 34, "offset": [2, 18, 28], "often": 7, "old": 34, "omp": [20, 26, 31, 32, 33, 34], "omp_num_threa": 26, "omp_num_thread": [20, 26, 31, 32, 34], "omp_proc_bind": [31, 33], "omp_schedul": [31, 33], "omp_set_num_thread": 34, "onboard": [19, 33], "onc": [2, 5, 6, 14, 17, 18, 20, 21, 32, 33], 
"ondevic": 29, "one": [2, 5, 7, 12, 13, 14, 16, 18, 19, 20, 26, 29, 31, 33, 34], "oneapi": [6, 33], "oneccl": [3, 6, 31, 34], "oneccl_bindings_for_pytorch": 6, "onednn": [2, 3, 13, 17, 26, 28, 34], "onednn_primitive_cache_capac": 33, "onednn_verbos": 4, "ones": [2, 6, 17], "onli": [1, 2, 5, 7, 8, 10, 11, 13, 14, 15, 16, 17, 18, 20, 21, 26, 28, 31, 32, 34], "onlyquantizationint4": 28, "onlyquantizationint8": 28, "oob": [10, 34], "op": [2, 7, 15, 16, 22, 28, 34], "op_type_dict": 2, "open": [1, 16, 28, 33], "openai": 28, "openmp": [2, 7, 20, 26, 30, 32, 34], "oper": [1, 2, 6, 8, 13, 15, 21, 32, 33, 34], "opportunit": 2, "opt": [2, 6, 17, 28], "optdecoderlay": 16, "optim": [1, 3, 4, 6, 8, 9, 11, 12, 14, 16, 18, 20, 21, 23, 25, 26, 31, 32, 33, 34], "optimize_lstm": 2, "optimize_transform": 34, "optimized_model": [2, 34], "optimized_optim": 2, "optimizer_state_dict": 6, "optimum": 10, "optin": 2, "option": [1, 2, 5, 7, 10, 14, 15, 16, 29, 31, 34], "optyp": 2, "order": [2, 17, 18, 21, 31, 33, 34], "org": [2, 7, 16, 26, 34], "organ": 18, "orgqr": 8, "origin": [2, 6, 7, 12, 13, 15, 17, 20, 29, 34], "original_max_position_embed": 2, "original_model": 2, "ormqr": 8, "other": [2, 6, 7, 8, 14, 17, 18, 19, 23, 28, 31, 33], "other_1": 2, "other_2": 2, "other_arg": 19, "otheriws": 13, "otherwis": [2, 7, 20], "our": [5, 16, 19, 28, 33, 34], "out": [2, 5, 6, 7, 8, 10, 13, 16, 19, 20, 30, 31, 33, 34], "outlier": [7, 16], "outplac": [18, 34], "output": [2, 6, 7, 8, 13, 14, 16, 18, 23, 26, 34], "output_concat_hint": [2, 20], "output_dir": [10, 14, 34], "output_hint": 20, "output_tokens_length": [6, 23], "outsid": 20, "outstand": 5, "over": [5, 7, 8, 9, 16, 18, 30, 31, 34], "overal": 33, "overflow": [26, 34], "overhead": [1, 2, 7, 10, 19, 20, 26, 28, 33, 34], "overlap": 32, "overrid": 15, "overridden": [2, 17], "oversize_threshold": [31, 33], "overview": [7, 25, 34], "overwrit": [2, 31], "own": [2, 6, 15, 28], "owner": 13, "p": 34, "p29": 30, "p90": 30, "pack": [2, 20, 34], "packag": 
[1, 2, 5, 6, 7, 10, 23, 25, 26, 32, 33, 34], "pad": [8, 10, 20, 34], "pad_max": 6, "pad_val": 6, "padding_mod": 34, "page": [2, 6, 13, 20, 24, 29, 30, 33, 34], "pagedattent": [2, 34], "paper": [2, 34], "parallel": [2, 5, 6, 7, 28, 33, 34], "param": [2, 19, 31], "param_i": 19, "param_n": 19, "paramet": [2, 6, 7, 8, 10, 16, 17, 19, 20, 21, 26, 28, 29, 30, 31, 33, 34], "parse_arg": [6, 23], "parser": [6, 23], "part": [3, 5, 7, 8, 18, 21, 26, 31, 33, 34], "parti": 34, "partial": 7, "particular": [5, 6, 8, 29, 34], "partit": [13, 33], "pass": [1, 2, 5, 10, 17, 20, 26, 32, 34], "past": 28, "past_key_valu": [2, 6], "past_kv_length": 2, "patch": [10, 34], "path": [2, 6, 7, 14, 18, 20, 23, 31, 33, 34], "pattern": [7, 11, 18, 28, 34], "pdf": 2, "pdropout": 2, "peak": [2, 7, 11, 34], "penal": 33, "pend": 34, "per": [2, 10, 15, 16, 20, 30, 31, 32, 33, 34], "per_batch": 2, "per_batch_ic_block": 2, "per_batch_ic_block_sym": 2, "per_channel_symmetr": [2, 6, 15], "per_device_train_batch_s": [10, 34], "per_ic_block": 2, "per_tensor": 2, "per_tensor_affin": [6, 15, 34], "per_tensor_symmetr": 15, "perchannelminmaxobserv": [2, 6, 15], "perf": [11, 18], "perform": [1, 2, 3, 4, 6, 7, 8, 9, 10, 13, 14, 15, 16, 18, 19, 21, 25, 28, 29, 31], "period": 33, "person": 3, "perspect": [2, 13, 18, 21, 28, 31, 33], "pertain": 17, "phase": [2, 20], "phi": [2, 28, 34], "physic": [2, 14, 20, 32, 33], "pick": 5, "piec": [2, 20], "pile": 6, "pin": [2, 20], "pinvers": 8, "pip": [4, 5, 33, 34], "pip3": 34, "place": [2, 8, 28, 33, 34], "placeholderobserv": [6, 15], "placement": 33, "plai": [7, 33], "plan": [5, 7, 10], "platform": [3, 7, 18, 32, 33, 34], "platinum": [14, 30, 32, 33], "pleas": [2, 6, 7, 11, 16, 22, 26, 28, 29, 31, 33, 34], "plu": 33, "pmi_rank": 6, "pmi_siz": [6, 29], "point": [2, 6, 8, 15, 21, 33, 34], "pointer": 17, "poisson_nll_loss": 8, "polar": 8, "polici": 33, "polish": 34, "polymorph": 17, "pool": [2, 20, 34], "poor": [26, 34], "popular": [1, 7, 22, 28, 30, 34], "popup": 5, "port": 
31, "portabl": 11, "portion": 16, "pos_embd_dim": 2, "posit": [2, 28, 33, 34], "position_id": [2, 6], "position_ids_pad": 6, "possibl": [2, 14, 15, 19, 28, 31, 33, 34], "post": [2, 4, 5, 7, 15, 28, 34], "potenti": [3, 7, 34], "pow": 13, "power": [2, 7, 33, 34], "ppn": 31, "pr": [7, 18, 34], "practic": [6, 21, 24, 28, 33], "pragma": 17, "pre": [2, 28, 34], "precis": [2, 4, 6, 13, 21, 23, 26, 30, 34], "pred": 16, "predefin": 2, "predict": 16, "prefer": [1, 7, 8, 15, 24], "prefetchw": 17, "prefetchwt1": 17, "prefil": [2, 34], "prefix": [31, 34], "preload": [2, 31], "prepack": [2, 6, 10, 18, 26, 34], "prepar": [2, 4, 6, 13, 16, 26, 29, 32, 34], "prepared_model": [2, 4, 6, 13, 15, 16, 26, 29, 34], "prerequisit": [5, 6], "present": 32, "pretrain": [6, 32, 34], "pretti": 33, "prevent": 19, "previou": [14, 16, 18, 33, 34], "previous": 32, "primari": 33, "primarili": [8, 34], "primit": [11, 20, 30, 34], "principl": [3, 18], "print": [6, 11, 12, 13, 14, 16, 17, 23, 31], "printf": 5, "prior": [2, 23], "privat": 34, "probabl": 2, "problem": [7, 19, 26, 32, 33], "proc": 31, "procedur": 32, "process": [2, 6, 7, 11, 12, 14, 16, 19, 20, 21, 26, 31, 32, 33], "processor": [3, 7, 19, 21, 28, 30, 33, 34], "proclist": 33, "prod": 8, "produc": [5, 8], "product": [1, 2, 7, 14, 28, 34], "program": [1, 5, 7, 11, 20, 31, 33, 34], "progress": [26, 28, 34], "project": [1, 6], "prompt": [4, 6, 23, 34], "propag": [13, 21, 33], "proper": 34, "properli": 31, "properti": [6, 32], "propos": [5, 7, 11, 16, 18, 21], "prototyp": [4, 13, 20, 26, 34], "provid": [1, 2, 5, 6, 7, 8, 11, 12, 13, 14, 16, 20, 22, 24, 26, 28, 29, 31, 32, 33, 34], "pseudo": [19, 21, 34], "pseudocod": [26, 34], "pt": [6, 13, 14, 15, 23, 32, 34], "pth": 6, "pthread": 20, "ptmalloc": 32, "ptq": 7, "public": 34, "publish": 34, "pull": 5, "purlei": 33, "purpos": [17, 31, 32, 33], "push": 34, "push_back": 6, "put": 33, "py": [2, 5, 10, 14, 20, 31, 32, 34], "pyg": 3, "pyi": 5, "pypi": [26, 34], "python": [1, 2, 4, 10, 14, 17, 20, 26, 
28, 29, 31, 32, 33, 34], "pytorch": [2, 3, 4, 6, 7, 8, 9, 10, 13, 14, 16, 17, 20, 23, 25, 26, 27, 28, 29, 30, 31, 33, 34], "q": [2, 28], "qa": [10, 34], "qconf_summari": [6, 15, 16, 29], "qconfig": [2, 4, 6, 13, 16, 26, 29, 32, 34], "qconfig_map": 6, "qconfig_summary_fil": [2, 6, 29], "qconfig_summary_file_path": 29, "qconfigmap": 6, "qint8": [2, 6, 15], "qkv": 34, "qparam": 15, "qr": 8, "qscheme": [2, 6, 15, 34], "qualiti": 34, "quant": [2, 16], "quant_method": 2, "quant_stat": 15, "quantconf": 34, "quantil": 8, "quantiz": [1, 3, 4, 13, 22, 26, 28, 30, 32, 34], "quantizat": 2, "quantization_config": [2, 6, 29], "quantize_per_tensor": 26, "quantized_model": [13, 15, 34], "queri": [2, 17, 18], "query_roteri": 2, "query_token": 2, "question": [18, 30], "quick": [1, 20, 24, 25], "quick_check": 5, "quickli": 2, "quicklint": 5, "quickstart_tutori": 16, "quint8": [6, 15], "quit": [17, 34], "qwen": [2, 28, 34], "qwen2": [28, 34], "r": [5, 6, 7, 14, 23, 30, 32, 33], "rais": [2, 10], "rand": [6, 8, 12, 13, 20, 26, 34], "randint": [6, 11, 32], "randn": [2, 10, 13, 16, 18, 32, 34], "random": 14, "rang": [1, 6, 7, 15, 16, 19, 21, 26, 31, 32, 34], "rank": [6, 31, 34], "rapid": 3, "rate": 21, "rather": [2, 18], "ratio": [22, 30, 34], "raw": 2, "rc": 34, "rc3": 34, "re": [5, 8, 32, 33, 34], "reach": 34, "read": [7, 19], "readm": 34, "real": [2, 7, 14, 15, 30, 34], "realli": 5, "realtim": 30, "reason": [2, 10, 18, 20, 34], "rebas": [5, 34], "receip": [16, 20], "receipt": 20, "receiv": 21, "recent": [6, 7, 18], "recip": [2, 4, 7, 13, 15, 26, 28, 34], "recognit": 33, "recommend": [1, 5, 6, 7, 9, 10, 15, 16, 20, 23, 30, 31, 33, 34], "record": [14, 32], "recov": 21, "recurs": 5, "reduc": [1, 2, 7, 15, 19, 20, 21, 22, 26, 28, 33, 34], "reduce_rang": 15, "reduct": 34, "refer": [1, 7, 9, 13, 14, 16, 17, 18, 20, 22, 23, 24, 25, 32, 34], "refin": 34, "reflection_pad1d": 8, "reflection_pad2d": 8, "regard": 13, "regardless": [8, 34], "region": [2, 8, 17, 33], "regist": [1, 7, 10, 16, 17, 
34], "registr": 7, "regress": [9, 34], "regular": [6, 21], "reinstal": [5, 26], "reinterpret": 18, "reinterpret_cast": 17, "rel": [2, 4, 16, 31, 34], "relat": [2, 6, 13, 17, 31, 33, 34], "releas": [1, 17, 18, 26, 30, 33], "reli": [18, 20], "relu": [2, 7, 13, 16, 18, 26, 34], "relu6": 34, "remain": 32, "remaind": [2, 20], "remark": [26, 30, 33], "remot": 33, "remov": [2, 5, 21, 34], "reorder": [2, 18, 28], "reorder_cach": 28, "repeat": [10, 18, 21], "repeatedli": 5, "replac": [2, 5, 7, 10, 26, 34], "replace_dropout_with_ident": 2, "replication_pad1d": 8, "replication_pad2d": 8, "replication_pad3d": 8, "repo": [5, 6, 7], "repo_root": 29, "report": [1, 17], "repres": [5, 7, 21], "represent": 18, "reproduc": 32, "request": [1, 5, 20, 32], "requir": [2, 5, 6, 8, 10, 16, 18, 21, 26, 28, 29, 31, 32, 34], "research": 28, "reserv": 33, "reshape_and_cach": 2, "residu": 31, "resiz": [6, 13], "resnet18": 34, "resnet18_xpu": 34, "resnet34": [30, 34], "resnet3d": 34, "resnet50": [12, 13, 14, 18, 30, 31, 33, 34], "resnet50_weight": [6, 12, 13], "resnext": 30, "resnext101": [18, 34], "resnext3d": 34, "resolv": 34, "resourc": [13, 20, 28, 32, 33], "respect": [14, 16, 30, 31, 34], "respons": 30, "rest": 32, "restart": 32, "result": [1, 2, 6, 10, 12, 14, 16, 18, 20, 21, 30, 31, 32, 33], "retinanet": 34, "retriev": 33, "return": [2, 6, 7, 8, 10, 16, 17, 20, 26, 34], "return_dict_in_gener": 34, "return_softmax": 2, "return_tensor": [6, 23], "reus": [2, 33], "review": [7, 34], "rf": 5, "rfc": 18, "rh": 17, "right": [7, 21, 23, 28], "risk": 34, "rm": 5, "rms_norm": [2, 34], "rmsnorm": [2, 28, 34], "rmsnorm_modul": 2, "rn50": [13, 34], "rn50_int8_jit": 32, "rn50_ipex_int8": 32, "rnn": 34, "rnncell": 15, "rnnt": [26, 34], "ro": 2, "roberta": [26, 34], "roialign": [7, 34], "role": 33, "root": [6, 13, 16, 17, 28], "rope": [28, 34], "rope_modul": 2, "rotari": [2, 28], "rotary_dim": 2, "rotary_embed": [2, 34], "rotary_half": 2, "rotary_ndim": 2, "rotaryembed": [2, 34], "roughli": 18, "round": 
[13, 21], "rounding_bia": 17, "row": 7, "rst": 5, "rule": [21, 34], "run": [2, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 26, 30, 31, 32, 33, 34], "run_20210712212258_inst": 31, "run_20210712212258_instance_0_cores_0": 31, "run_20210712214504_inst": 31, "run_20210712214504_instance_0_cores_22": 31, "run_20210712220928_inst": 31, "run_20210712220928_instance_0_cores_0": 31, "run_20210712221150_inst": 31, "run_20210712221150_instance_0_cores_0": 31, "run_20210712221150_instance_1_cores_22": 31, "run_20210712221305_inst": 31, "run_20210712221305_instance_0_cores_0": 31, "run_20210712221305_instance_1_cores_11": 31, "run_20210712221305_instance_2_cores_22": 31, "run_20210712221305_instance_3_cores_33": 31, "run_20210712221415_inst": 31, "run_20210712221415_instance_0_cores_0": 31, "run_20210712221415_instance_10_cores_40": 31, "run_20210712221415_instance_1_cores_4": 31, "run_20210712221415_instance_2_cores_8": 31, "run_20210712221415_instance_3_cores_12": 31, "run_20210712221415_instance_4_cores_16": 31, "run_20210712221415_instance_5_cores_20": 31, "run_20210712221415_instance_6_cores_24": 31, "run_20210712221415_instance_7_cores_28": 31, "run_20210712221415_instance_8_cores_32": 31, "run_20210712221415_instance_9_cores_36": 31, "run_20210712221615_inst": 31, "run_20210712221615_instance_0_cores_11": 31, "run_20210712223308_inst": 31, "run_20210712223308_instance_0_cores_0": 31, "run_20210713152500_instance_0_cores_0": 31, "run_20210713153048_instance_0_cores_0": 31, "run_20210713153333_instance_0_cores_0": 31, "run_20210713153659_instance_0_cores_0": 31, "run_20220106130151_instance_0_cores_0": 31, "run_benchmark": [26, 34], "run_qa": [10, 34], "runner": 5, "running_mod": 34, "runtim": [1, 8, 13, 17, 31, 33, 34], "runtimeerror": [26, 34], "s1": 20, "s7": 34, "s8": 34, "sacrif": 8, "sai": 5, "salesforc": 28, "same": [2, 5, 7, 10, 15, 16, 17, 18, 20, 21, 28, 31, 32, 33, 34], "same_model_execution_again": 34, "sampl": [2, 6, 9, 14, 16, 17, 29, 33], "sample_input": [2, 9, 34], 
"sample_text_captum_input": 32, "sampler": 6, "sampling_s": [2, 4, 16, 34], "sapphir": 3, "satisfi": [15, 26], "satur": 34, "save": [2, 5, 6, 7, 13, 14, 15, 16, 18, 21, 28, 32, 34], "save_qconf_summari": [6, 15, 16, 29], "scalabl": [3, 7, 21, 28, 30, 33, 34], "scalar": 2, "scalartyp": 17, "scalartypetocpptyp": 17, "scale": [2, 3, 6, 15, 28], "scale_attn": 2, "scaled_dot_product_attent": 2, "scatter": 31, "scenario": [2, 6, 7, 18, 33, 34], "schedul": [1, 2, 13, 20, 31, 33], "scheme": 32, "scope": [2, 7, 8, 21, 34], "script": [1, 2, 3, 4, 5, 6, 7, 8, 10, 14, 17, 20, 23, 24, 26, 28, 29, 30, 32, 33, 34], "scriptmodul": [2, 13, 20], "sdk": 34, "search": [1, 2, 4, 5, 7, 16, 22, 28, 31], "sec": 30, "second": [2, 10, 28, 32, 33], "secondli": 28, "secret": 18, "section": [1, 6, 7, 8, 14, 20, 23, 24, 25, 28, 29, 32, 33, 34], "secur": 3, "see": [1, 2, 5, 8, 14, 34], "seed": 2, "seen": 28, "segment": 34, "select": [2, 5, 7, 13, 24, 34], "self": [2, 6, 8, 10, 16, 20, 26, 34], "selu": 34, "semant": 18, "sens": 21, "sep": [3, 17], "separ": [7, 19, 27, 33], "seq_classification_artifact": 32, "seq_info": 2, "seq_len": [2, 30], "seq_length": [6, 11, 32], "seqlen_k": 2, "seqlen_q": 2, "sequenc": [2, 18, 21, 28, 34], "sequenti": 16, "seri": 33, "serv": [20, 34], "server": [32, 33], "servic": [6, 28, 30, 33], "session": 30, "set": [1, 2, 4, 5, 6, 7, 8, 14, 15, 16, 17, 21, 24, 26, 28, 30, 31, 32, 33, 34], "set_flush_denorm": 33, "set_format": 6, "set_glob": 6, "set_num_thread": [26, 34], "set_properti": 6, "sete": 15, "settensorexprfuseren": 26, "setup": [5, 6, 28, 34], "setup_config": 32, "setup_lint": 5, "sever": [2, 7, 10, 19, 30, 31, 34], "sgd": [2, 6, 7, 8, 16, 19], "sgemm": 34, "sha": 17, "shall": [5, 18, 33], "shape": [2, 6, 7, 16, 20, 23, 30, 33, 34], "shard": 28, "share": [1, 5, 6, 16, 20, 32, 33, 34], "share_weight_observ": 2, "shared_criterion": [16, 22], "sheet": 23, "shift": 21, "ship": 28, "short_factor": 2, "shortcut": 34, "shorten": 5, "shorter": [21, 28], "should": [2, 
5, 8, 15, 20, 28, 31, 33], "show": [8, 17, 21, 28, 29, 30, 31, 32, 33, 34], "shown": [1, 6, 18, 28, 31, 32], "shuffl": 6, "shufflenet": 30, "shufflenetv2_x1": 30, "side": [15, 33], "sigmoid": [13, 34], "sign": 21, "signficantli": 32, "signific": 21, "significantli": [28, 34], "silu": [2, 13], "similar": [15, 17, 33], "similarli": 32, "simpl": [5, 7, 8, 11, 18, 33, 34], "simplenet": [8, 34], "simpli": [6, 7, 26, 31], "simplifi": [10, 34], "simultan": 20, "sin": 2, "sinc": [6, 7, 18, 19, 20, 21, 26, 33, 34], "sincer": 34, "singl": [2, 7, 13, 14, 16, 19, 20, 30, 32, 34], "single_query_cached_kv_attent": 2, "site": 32, "situat": [7, 14], "six": 33, "size": [2, 6, 7, 11, 15, 16, 17, 18, 23, 26, 28, 30, 32, 33, 34], "sizeof": 17, "skip": [5, 6, 17, 18], "skip_special_token": [6, 23], "skylak": 15, "sleef": 17, "sleep": 33, "slice": [6, 18], "sliu": 34, "slope": 2, "slot": [2, 30], "slot_map": 2, "slow": 34, "slower": [8, 33, 34], "small": [7, 19, 33, 34], "smaller": [8, 17], "smooth": 7, "smooth_l1_loss": 8, "smoothquant": [2, 6, 7, 16, 22, 28, 34], "smoothquant_arg": [2, 16], "snc": 34, "snippet": [10, 29], "so": [2, 5, 6, 7, 8, 15, 17, 18, 20, 30, 31, 32, 33, 34], "sock": 32, "socket": [14, 30, 32, 33, 34], "soft_margin_loss": 8, "softmax": [2, 13, 34], "softmax_scal": 2, "softwar": [3, 27, 34], "sole": 33, "solut": [2, 7, 26, 28, 34], "solv": [7, 19, 33], "some": [2, 5, 7, 8, 13, 16, 17, 18, 20, 26, 28, 31, 32, 33, 34], "someth": 18, "sometim": [31, 33], "sophist": 33, "sourc": [1, 5, 6, 17, 27, 28, 33, 34], "space": [2, 7, 16, 18, 22, 33], "spars": [7, 18, 34], "sparsiti": 2, "spawn": [7, 20], "special": [17, 18, 28], "specif": [1, 2, 5, 6, 7, 12, 18, 20, 26, 28, 31, 33, 34], "specifi": [2, 5, 6, 14, 20, 31, 33, 34], "specifii": 17, "spectr": 30, "speech": [3, 33], "speed": [2, 7, 11, 19, 28, 33, 34], "speedup": [2, 6, 8, 28, 30, 34], "sphinx": 5, "split": [2, 6, 7, 16, 17, 19, 20, 26, 34], "split_bf16_from_fp32": 21, "split_master_weight_for_bf16": 2, "splitsgd": 
[7, 21], "spontan": 18, "sqrt": [2, 13, 19], "squad": [10, 30, 34], "squar": [13, 28], "squenc": 2, "src": [2, 17], "src_data_ptr": 18, "src_md": 18, "src_mem": 18, "ssd": [30, 34], "sse": 17, "sse2": 17, "sse3": 17, "sse4_1": 17, "sse4_2": 17, "ssse3": 17, "stabil": [2, 8, 34], "stabilityai": 28, "stabl": [2, 3, 8, 34], "stablelm": [2, 28], "stack": [6, 8], "stage": [7, 10, 19, 20, 29, 33, 34], "stakehold": 34, "stall": 33, "standard": [1, 34], "stanford": 34, "starcod": [28, 34], "start": [1, 3, 4, 5, 6, 7, 10, 20, 24, 34], "start_dim": 20, "state": [2, 15, 19, 28], "state_dict": [2, 6, 34], "state_sum": 19, "state_sum_i": 19, "state_sum_n": 19, "statement": [14, 17], "static": [2, 4, 16, 26, 28, 31, 32, 33, 34], "static_quantized_model": 6, "staticquantizationint8": 28, "statist": 7, "statu": 17, "std": [6, 17, 19], "stdio": 5, "stdout": 31, "stead": 17, "steam": [20, 34], "step": [2, 5, 6, 7, 8, 14, 16, 19, 21, 32], "step_siz": [16, 22], "stft": 8, "stick": 7, "still": [2, 5, 7, 8, 13, 16, 18, 21, 26, 34], "stock": [13, 30, 34], "stop": [2, 33], "storag": 19, "store": [2, 17, 18, 19, 21, 28, 31, 32, 33, 34], "store_tru": [6, 23], "str": [2, 6, 14, 23, 31], "straight": [13, 33], "straightforward": 34, "strategi": [14, 31, 33, 34], "stream": [2, 7, 20, 34], "streamlin": 34, "strict": [6, 32], "stride": [8, 10, 20, 34], "stride_c": 18, "stride_h": 18, "stride_n": 18, "stride_w": 18, "string": [2, 31], "structur": [1, 18, 31, 34], "style": [2, 5], "sub": [20, 28, 33], "subfold": 17, "subgraph": 2, "subject": [7, 17, 20, 27, 34], "submit": [1, 5, 7, 20], "submodul": 5, "subsequ": [18, 33], "substr": 5, "success": [10, 24], "suffer": 20, "suffix": 17, "suggest": [1, 2, 15, 18, 20, 33, 34], "suit": 5, "sum": [13, 16, 18, 19, 34], "summar": 26, "summari": [6, 34], "super": [8, 10, 16, 20, 26, 34], "superset": 20, "suppli": 8, "support": [2, 5, 6, 7, 13, 15, 16, 17, 18, 19, 20, 21, 25, 26, 28, 29, 31, 32, 33, 34], "suppos": [2, 6, 14, 33], "sure": [5, 14, 15, 32, 33], 
"svd": 8, "sw": 30, "swish": 34, "switch": [7, 17, 31, 33, 34], "sy": 30, "sycl": 1, "symbol": 20, "symeig": 8, "symlink": 5, "symmetr": [2, 15, 34], "sync": [5, 20], "synchron": [20, 26, 34], "sysctl": 33, "system": [17, 33], "systemat": 7, "t": [2, 5, 7, 8, 14, 15, 16, 17, 18, 20, 26, 32, 34], "t5": [2, 26, 28, 34], "t_valu": 17, "tab": 5, "tabl": [2, 7, 17, 28, 30, 34], "tackl": 7, "tacotron2": 34, "take": [1, 2, 7, 8, 10, 12, 13, 14, 18, 21, 25, 26, 30, 31, 33], "taken": 32, "tanh": [13, 34], "target": [5, 6, 10, 13, 14, 17, 34], "target_link_librari": 6, "target_v": 14, "task": [2, 7, 28, 31, 33, 34], "task1": 20, "task2": 20, "taskset": 31, "tbd": 26, "tc": 14, "tcmalloc": 32, "te": 34, "team": [1, 5], "techniqu": [1, 2, 7, 11, 12, 28, 34], "technolog": [1, 7, 28], "technologi": [3, 7], "tee": 31, "tell": [18, 20, 31, 33], "temperatur": [6, 23], "tenosr": 2, "tensor": [2, 6, 7, 8, 11, 15, 16, 17, 20, 26, 28, 32, 34], "tensorexpr_fus": 26, "tensorflow": 18, "tensoriter": 18, "terabyt": 30, "term": 27, "termin": 14, "test": [7, 16, 17, 30, 34], "test_": 5, "test_alias_analysi": 5, "test_bceloss": 5, "test_data": 16, "test_dataload": 16, "test_jit": 5, "test_mseloss": 5, "test_nn": 5, "test_sequenti": 5, "testclassnam": 5, "testjit": 5, "testnam": 5, "testnn": 5, "testsuit": 5, "text": [3, 6, 26, 28, 30, 33, 34], "text_max_length": 2, "tgi": 34, "than": [2, 5, 7, 17, 18, 20, 21, 26, 31, 33, 34], "thank": [5, 34], "thei": [7, 8, 31, 33], "them": [1, 5, 7, 18, 19, 28, 31, 33], "themselv": [31, 34], "therefor": 33, "thi": [2, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 26, 27, 28, 29, 30, 31, 34], "thing": [14, 33], "third": [19, 34], "those": [2, 15, 33], "though": [2, 7], "thrash": 33, "threa": 34, "thread": [1, 2, 7, 20, 26, 30, 31, 32, 33, 34], "three": [7, 16, 17], "threshold": 33, "through": [1, 2, 6, 7, 8, 12, 25, 28, 33, 34], "throughput": [2, 3, 18, 20, 26, 28, 30, 34], "thu": [2, 7, 8, 10, 18, 20, 21, 28, 31, 32, 33], "thudm": 28, "tidi": 
5, "tightli": 34, "tiiuae": 28, "tile": 17, "time": [2, 5, 7, 14, 16, 17, 18, 19, 26, 28, 30, 33, 34], "timeout": [2, 5, 21], "timestamp": [2, 28], "tip": 17, "tmp": [10, 32, 34], "to_bfloat16_train": 7, "to_dens": 18, "to_mkldnn": 18, "togeth": [7, 14, 20, 33, 34], "toggl": 7, "token": [2, 6, 23, 28, 30], "tokenize_funct": 6, "tolist": 16, "tool": [17, 33, 34], "toolset": 17, "top": [10, 21, 34], "top1": 30, "toplevel": 5, "topologi": [7, 18, 19, 26, 30, 31, 33, 34], "torch": [1, 2, 4, 6, 8, 10, 11, 12, 13, 15, 16, 18, 20, 23, 26, 29, 32, 33, 34], "torch_ccl": 6, "torch_check": 17, "torch_dtyp": [6, 23], "torch_ipex": [17, 34], "torch_ipex_librari": 6, "torchconfig": 6, "torchdynamo": [1, 7, 12, 23, 34], "torchrun": 34, "torchscirpt": 2, "torchscript": [1, 2, 5, 7, 10, 11, 12, 19, 23, 26, 32, 34], "torchserv": [3, 34], "torchvis": [6, 10, 12, 13, 16, 18, 32, 34], "torchvison": 34, "total": [2, 6, 30, 33], "total_new_token": [6, 23], "totensor": [6, 13, 16], "tpp": 2, "trace": [1, 6, 7, 8, 12, 13, 15, 16, 20, 23, 26, 32, 34], "trace_model": 34, "traced_model": [6, 10, 13, 15, 16, 26, 34], "traced_model1": 20, "traced_model2": 20, "track": 1, "track_running_stat": 10, "trade": [8, 28, 30, 34], "tradeoff": 15, "trail": [5, 21], "train": [2, 3, 4, 7, 11, 13, 15, 16, 18, 21, 23, 26, 28, 31, 34], "train_dataload": 16, "train_dataset": [6, 13], "train_load": [6, 8], "training_data": 16, "transfer": 33, "transform": [2, 3, 4, 6, 10, 11, 13, 16, 18, 22, 23, 28, 29, 32, 33, 34], "transformer_handler_gener": 32, "transformerencoderlay": 26, "transnetv2": 34, "transpar": [2, 7, 29, 33, 34], "transpos": [13, 34], "tree": [5, 6], "tri": 12, "trial": 14, "triangular_solv": 8, "trigger": 12, "triplet_margin_loss": 8, "true": [2, 4, 6, 10, 12, 13, 14, 15, 16, 17, 22, 23, 31, 32, 33, 34], "trust_remote_cod": [6, 23], "truth": 21, "try": [2, 5, 6, 7, 12, 14, 16, 26, 31, 33, 34], "tunabl": [30, 32], "tune": [2, 3, 4, 7, 8, 15, 20, 26, 28, 29, 31, 32, 34], "tuned_conf": 16, 
"tuned_model": [4, 16, 34], "tunin": 32, "tuning_tim": [2, 4, 16, 34], "tupl": [2, 6, 17, 20], "turboboost": 30, "turn": [7, 34], "tutori": [5, 6, 15, 16, 29, 34], "two": [2, 7, 14, 16, 20, 21, 28, 32, 33, 34], "txt": [5, 6, 32], "type": [2, 4, 5, 6, 7, 10, 16, 17, 18, 20, 21, 23, 30, 31, 32, 34], "types": 17, "typic": [6, 10, 28, 33, 34], "u": [30, 32], "u7": 34, "u8": 34, "ubuntu": 30, "ucod": 30, "uint32_t": 17, "ultra": 33, "uma": 33, "unabl": 10, "unalign": [17, 34], "uncas": [4, 6, 10, 11, 32, 34], "undefin": [2, 20, 33], "under": [2, 6, 8, 18, 20, 27, 31, 34], "undergo": 26, "underhood": 34, "underli": [1, 17, 28], "underneath": 34, "understand": [21, 28, 33], "undesir": 31, "unexpect": 2, "unifi": [2, 31], "uniform": 32, "uninstal": 5, "union": 2, "unit": [1, 2, 33], "unittest": 5, "unix": 32, "unlik": 6, "unlist": 8, "unnecessari": 33, "unpack": [26, 34], "unpad": 2, "unpredict": 2, "unrel": 6, "unsign": 34, "unsqueez": 2, "unstabl": 8, "until": [5, 20, 21, 33], "untrack": 5, "unus": [31, 33], "unutil": 32, "up": [2, 3, 7, 11, 20, 24, 28, 33, 34], "updat": [2, 5, 7, 16, 19, 21, 22, 34], "upgrad": 34, "upi": 33, "upload": 34, "upper": [18, 33], "upsampl": [18, 34], "upstream": [7, 18, 34], "url": [32, 34], "us": [1, 2, 3, 4, 5, 6, 11, 14, 15, 17, 18, 19, 21, 23, 24, 25, 26, 27, 28, 32, 33, 34], "usabl": 34, "usag": [2, 6, 7, 8, 23, 25, 32, 33, 34], "use_all_nod": 14, "use_default_alloc": [32, 34], "use_logical_cor": [14, 32], "user": [1, 2, 7, 9, 10, 12, 13, 15, 16, 18, 20, 26, 31, 32, 33, 34], "user_model": [6, 15], "usr": [6, 17, 31, 32], "usual": [2, 18, 20, 33], "usuali": 33, "usus": 32, "ut": 31, "util": [1, 6, 7, 10, 13, 15, 16, 18, 21, 28, 31, 33, 34], "ux": 34, "v": 5, "v0": [28, 34], "v1": [28, 34], "v2": [28, 30, 34], "v3": 34, "valid": [2, 21, 34], "valu": [2, 6, 10, 14, 16, 17, 19, 20, 21, 22, 26, 28, 31, 32, 33, 34], "value_cach": 2, "value_token": 2, "var": 29, "vari": 16, "variabl": [2, 5, 17, 30, 31, 32, 33, 34], "varianc": 34, 
"variance_epsilon": 2, "variant": [2, 8, 28, 34], "variou": [6, 7, 14, 28, 33, 34], "varlen_attent": [2, 34], "varlenattent": [2, 34], "varlenattention_modul": 2, "ve": 34, "vec256": 17, "vec512": 17, "vec_bia": 17, "vector": [1, 2, 6, 17, 18, 25, 28], "vectors": 17, "verbos": [2, 4, 31], "verbose_off": 2, "verbose_on": 2, "verbose_on_cr": 2, "veri": [2, 5, 15, 18, 28], "verifi": [6, 7], "version": [6, 7, 16, 17, 25, 26, 27, 32, 33, 34], "vgg": 30, "vgg11": 30, "via": [2, 5, 6, 7, 18, 20, 30, 31, 33, 34], "video": 7, "view": [13, 18, 20, 21], "view_as_complex": 8, "virtual": 17, "virtual_env": [31, 32], "vision": [3, 6, 28, 30, 34], "visit": [7, 33], "vllm": [2, 34], "vm": 34, "vnni": [1, 15, 17, 25, 28], "vocab_s": [6, 11, 32], "voic": 33, "void": 17, "vstack": 6, "w": [7, 16, 18, 21, 30, 32], "wa": [7, 31, 32, 33, 34], "wai": [5, 10, 16, 18, 28, 34], "wait": [20, 33], "wake": 20, "walk": 34, "want": [2, 5, 7, 14, 15, 17, 20, 31, 34], "warm": 33, "warn": [5, 6, 12, 31, 32, 34], "wav2vec2": 33, "wave2vec": 34, "wc": 18, "we": [1, 2, 5, 6, 7, 8, 9, 10, 14, 15, 16, 17, 18, 19, 20, 21, 23, 28, 30, 32, 33, 34], "web": 28, "webpag": 34, "websit": 7, "wei_ic_observ": 2, "wei_observ": 2, "weight": [1, 2, 7, 10, 12, 13, 15, 16, 18, 20, 22, 23, 26, 28, 34], "weight_dacai": 21, "weight_decai": [7, 19], "weight_dtyp": [2, 6, 29], "weight_qschem": 2, "weights_prepack": [2, 6, 7, 23, 26], "well": [1, 2, 5, 6, 7, 11, 16, 20, 21, 24, 28, 32, 33, 34], "were": [30, 31, 32, 33], "west": 30, "what": [3, 5, 6, 8, 23], "wheel": 34, "when": [2, 5, 6, 7, 8, 9, 14, 18, 19, 20, 21, 22, 25, 26, 28, 30, 31, 32, 33, 34], "where": [2, 5, 7, 16, 21, 33, 34], "wherea": 30, "whether": [2, 6, 8, 16, 18, 22, 23, 33], "which": [1, 2, 5, 7, 8, 10, 14, 15, 16, 17, 18, 20, 26, 28, 30, 31, 32, 33, 34], "while": [2, 7, 8, 11, 12, 18, 21, 26, 28, 31, 32, 33, 34], "whisper": [2, 28, 34], "whl": 34, "who": 10, "whole": [19, 20, 33], "wide": [21, 34], "wider": 1, "widespread": [1, 7, 28], "width": [17, 18], 
"wikipedia": [13, 33], "wise": [2, 16, 19, 22, 29, 34], "wish": [5, 7], "with_arg": [2, 6, 15], "within": [5, 16, 21, 29, 33, 34], "without": [2, 5, 6, 7, 8, 10, 16, 20, 21, 26, 32, 34], "wlydcrb1": 30, "wn": 18, "won": [2, 7, 8, 17, 26], "woq": [2, 28], "woqactquantmod": 2, "woqlowpmod": [2, 6, 29], "woqweightdtyp": [2, 6, 29], "woqweightqschem": 2, "work": [2, 5, 6, 7, 14, 15, 17, 20, 26, 28, 29, 31, 33, 34], "workabl": 2, "workaround": [26, 34], "worker": [20, 31], "workflow": 34, "workload": [1, 6, 7, 8, 10, 11, 12, 21, 26, 28, 29, 30, 31, 33, 34], "workload1": 30, "workspac": 6, "world": [5, 7], "world_siz": [6, 29], "worri": 32, "wors": 2, "worth": 34, "would": [2, 5, 6, 14, 16, 17, 18, 30, 31, 32, 33, 34], "wrap": 34, "write": [7, 17], "written": [5, 6, 17], "x": [1, 2, 5, 6, 8, 10, 13, 15, 16, 17, 18, 20, 21, 23, 26, 34], "x1": 20, "x2": 20, "x86": 3, "x86_64": 30, "xcr0": 17, "xdf": 5, "xe": 33, "xeon": [3, 7, 14, 21, 28, 30, 32, 33, 34], "xl": 28, "xlm": 26, "xmx": 1, "xpu": [1, 2, 3, 34], "xsave": 17, "xx": 6, "xx_c": 34, "xx_v": 34, "y": [8, 15, 16, 20, 21, 34], "y1": 20, "y1_futur": 20, "y2": 20, "y2_futur": 20, "y_runtim": 20, "yaml": 14, "ye": 5, "year": 28, "yet": [2, 6, 26, 34], "yield": [1, 7, 33], "yolov3": 34, "you": [1, 2, 5, 6, 7, 8, 13, 14, 15, 17, 18, 20, 23, 25, 26, 28, 29, 31, 33, 34], "your": [1, 5, 6, 7, 8, 10, 14, 15, 20, 23, 24, 26, 27, 28, 29, 34], "your_calibration_dataset": 29, "your_conf_fil": [4, 34], "your_generation_param": 34, "your_python_script": [4, 34], "your_pytorch_script": [4, 31], "yuan": [2, 28], "yuan2": 28, "z11pa": 33, "zero": [6, 15, 34], "zero_grad": [6, 7, 16], "zero_tensor": 2, "zip": [6, 23, 34], "zone": [30, 34], "zoo": [6, 30], "\u03b1": 21}, "titles": ["Intel\u00ae Extension for PyTorch* CPU ISA Dynamic Dispatch Design Doc", "Intel\u00ae Extension for PyTorch*", "API Documentation", "Blogs & Publications", "Cheat Sheet", "Contribution", "Examples", "Features", "Auto Mixed Precision (AMP)", "Auto Channels 
Last", "Codeless Optimization (Prototype)", "Fast BERT (Prototype)", "Graph Capture (Prototype)", "Graph Optimization", "HyperTune (Prototype)", "Intel\u00ae Extension for PyTorch* optimizations for quantization", "INT8 Recipe Tuning API (Prototype)", "ISA Dynamic Dispatching", "Channels Last", "Optimizer Fusion", "Runtime Extension", "Split SGD", "Smooth Quant Recipe Tuning API (Prototype)", "Quick Start", "Installation", "Introduction", "Troubleshooting", "License", "Large Language Models (LLM) Optimization Overview", "LLM Optimizations Frontend API", "Performance", "Launch Script Usage Guide", "TorchServe with Intel\u00ae Extension for PyTorch*", "Performance Tuning Guide", "Releases"], "titleterms": {"": 34, "0": [6, 7, 34], "1": [7, 14, 32, 34], "10": [30, 34], "100": 34, "11": [30, 34], "12": 34, "13": [7, 34], "2": [6, 7, 14, 32, 34], "200": [30, 34], "2xlarg": 30, "3": [32, 34], "300": 34, "4": [32, 34], "5": 34, "8": 34, "9": 34, "That": 18, "The": 10, "__call__": 10, "access": [28, 33], "accuraci": 30, "add": 17, "ai": [6, 30], "algorithm": 16, "all": [18, 31], "alloc": [31, 33], "alpha": [16, 34], "alreadi": 10, "amp": [7, 8], "an": 30, "api": [2, 7, 9, 13, 16, 17, 18, 22, 25, 28, 29], "appli": 10, "architectur": 1, "archiv": 32, "asynchron": 20, "aten": [17, 18], "attr": 10, "auto": [7, 8, 9, 16, 20], "autocast": 8, "autotun": 16, "aw": 30, "b": 18, "basic": 20, "behavior": 8, "benchmark": 32, "bert": [2, 6, 7, 11, 32], "beta": [6, 7], "better": 5, "bf16": [6, 10, 13, 29], "bfloat16": [6, 8, 21, 26, 30], "bind": 20, "block": 18, "blog": 3, "boost": 32, "build": [5, 17], "c": [5, 6, 18], "c6i": 30, "cach": [28, 33], "calibr": [6, 15], "can": 8, "captur": [7, 12], "case": [8, 10, 20], "center": 30, "chang": 34, "channel": [7, 9, 18, 33], "cheat": 4, "check": 17, "code": 17, "codegen": 17, "codeless": [7, 10], "command": 10, "common": 29, "compil": [7, 17], "configur": [20, 30, 33], "content": [32, 33], "contribut": 5, "convers": 18, "convert": 15, 
"convolut": 18, "core": [20, 31, 32], "correct": 26, "coverag": 18, "cpp": 17, "cpu": [0, 2, 17, 18, 33], "creat": [18, 32], "creation": 18, "csrc": 17, "custom": [17, 28], "d": 18, "data": [28, 30], "debug": [5, 17], "deepspe": [28, 29], "default": [8, 9, 14, 18, 31], "defin": [14, 15], "demo": 28, "denorm": 33, "deploi": [15, 32], "deploy": 6, "descent": 21, "descript": [11, 12], "design": [0, 17, 20, 31], "detail": 20, "determin": 16, "develop": 5, "disabl": 9, "dispatch": [0, 7, 17], "dispatchstub": 17, "distribut": [6, 28, 29], "do": 15, "doc": 0, "document": [2, 5, 25, 32, 33], "dure": 20, "dynam": [0, 6, 7, 15, 17, 26], "dyndisp": 17, "eager": [6, 8], "eas": [9, 13], "easi": 7, "ec2": 30, "elig": 8, "enabl": 9, "exampl": [6, 10, 11, 12, 14, 16, 17, 20, 31], "examples1": 20, "examples2": 20, "examples3": 20, "explicitli": 10, "export": 32, "extens": [0, 1, 5, 7, 15, 20, 26, 32], "fast": [2, 6, 7, 11], "featur": [6, 7, 11, 12, 17], "file": 32, "fix": 16, "float32": [6, 8], "fold": 13, "folder": 17, "format": 18, "forward": 10, "fp32": [6, 10, 13, 29, 30], "from": [6, 7], "frontend": 29, "fusion": [13, 19], "gener": [2, 26], "get": 25, "gnu": [31, 33], "gradient": 21, "graph": [2, 7, 12, 13, 28], "guid": [31, 33], "h": 17, "hardwar": [30, 33], "highlight": 34, "how": 20, "huggingfac": 10, "hyperparamet": 14, "hypertun": [7, 14], "i": [18, 20, 31], "ii": 31, "iii": 31, "implement": [17, 20], "improv": 34, "includ": 31, "index": 31, "indirect": 28, "infer": [6, 8, 28, 29, 31, 32], "input": [8, 20], "instal": [24, 32], "instanc": [28, 30, 31], "instead": 10, "int4": 6, "int8": [6, 7, 13, 16, 26, 30, 32], "intel": [0, 1, 5, 6, 15, 30, 31, 32, 33], "intrin": 17, "introduct": [8, 19, 25], "iomp": 20, "ipex": [10, 28], "isa": [0, 7, 17], "issu": [9, 20, 34], "iv": 31, "jemalloc": [31, 33], "jit": 10, "kernel": [17, 18], "known": [9, 20, 34], "kv": 28, "languag": [6, 7, 28], "larg": [6, 7, 28], "last": [7, 9, 18, 33], "latenc": 31, "launch": [10, 31], "launcher": [14, 
32], "layout": 18, "level": [2, 17, 28], "librari": 31, "licens": 27, "linear": 28, "lint": 5, "list": 28, "llm": [2, 6, 7, 23, 28, 29, 30], "load": 20, "local": 5, "logic": 31, "low": 28, "manner": 18, "manual": 17, "matter": 18, "memori": [18, 31, 33], "method": 10, "methodologi": [13, 28], "mix": [7, 8], "mode": [6, 28, 31], "model": [6, 7, 13, 15, 18, 20, 28, 32], "modul": [2, 10, 20, 28], "motiv": 10, "multi": 32, "multipl": 31, "multistream": 20, "nativ": 18, "nchw": 18, "nchw16c": 18, "new": [6, 7, 34], "nhwc": 18, "node": 31, "non": 33, "note": 34, "numa": 33, "numactl": 33, "number": [30, 31, 33], "omp_num_thread": 33, "omp_thread_limit": 33, "onednn": [18, 33], "onli": [6, 29], "op": 8, "openmp": [31, 33], "oper": [7, 18, 19, 28], "optim": [2, 7, 10, 13, 15, 19, 28, 29], "origin": 10, "other": 34, "output": 20, "overview": [17, 28, 30, 31, 33], "path": 8, "pattern": 13, "perform": [20, 26, 30, 32, 33, 34], "physic": 31, "pin": 32, "precis": [7, 8, 28], "preload": 20, "prepar": 15, "prerequisit": 11, "primit": [18, 33], "privat": 17, "process": 17, "product": 30, "promot": 8, "prototyp": [2, 6, 7, 10, 11, 12, 14, 16, 22, 28], "pseudocod": 29, "public": 3, "pytest": 5, "python": [5, 6, 7], "pytorch": [0, 1, 5, 15, 18, 32], "qconfig": 15, "quant": 22, "quantiz": [2, 6, 7, 15, 16, 29], "quick": 23, "recip": [16, 20, 22], "refer": [6, 8], "regist": [18, 32], "regress": 26, "releas": 34, "requir": [17, 20], "resnet50": [6, 32], "result": [26, 34], "runtim": [2, 7, 20, 26], "scale": 32, "scenario": 29, "script": 31, "search": 14, "select": 17, "serial": 32, "serv": 32, "set": 20, "sgd": 21, "shape": 26, "sheet": 4, "singl": [28, 31], "smooth": [6, 16, 22], "smoothquant": 29, "softwar": [30, 33], "space": 14, "specif": [8, 17], "split": 21, "start": [23, 25, 32], "static": [6, 15], "statu": 18, "stochast": 21, "stride": 18, "struct": 17, "structur": [20, 33], "stub": 17, "support": [1, 8, 10], "target": 18, "task": 20, "tcmalloc": [31, 33], "tensor": 18, "test": 
5, "thi": [32, 33], "through": 16, "throughput": 31, "tip": 5, "torch": 7, "torchdynamo": [6, 26], "torchscript": [6, 8], "torchserv": 32, "trace": 10, "train": [6, 8], "troubleshoot": 26, "tune": [14, 16, 22, 33], "type": [8, 28], "uniform": 33, "unit": 5, "us": [7, 8, 9, 10, 13, 16, 20, 31], "usag": [10, 11, 12, 14, 16, 20, 26, 29, 31], "user": 14, "v": 31, "v1": 30, "vec": 17, "verifi": 28, "version": 30, "vi": 31, "via": 28, "vii": 31, "viii": 31, "weight": [6, 29], "what": [18, 34], "widest": 8, "wip": 18, "woq": 29, "worker": 32, "write": [5, 18], "xyz": 17, "xyzkrnl": 17, "your": 31, "your_conf_fil": 14, "your_python_script": 14}}) \ No newline at end of file diff --git a/cpu/2.5.0+cpu/tutorials/api_doc.html b/cpu/2.5.0+cpu/tutorials/api_doc.html index 2f56f6335..81944c4d8 100644 --- a/cpu/2.5.0+cpu/tutorials/api_doc.html +++ b/cpu/2.5.0+cpu/tutorials/api_doc.html @@ -1751,7 +1751,7 @@

Graph OptimizationSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/blogs_publications.html b/cpu/2.5.0+cpu/tutorials/blogs_publications.html index 099e42d92..c20682bfd 100644 --- a/cpu/2.5.0+cpu/tutorials/blogs_publications.html +++ b/cpu/2.5.0+cpu/tutorials/blogs_publications.html @@ -167,7 +167,7 @@

Blogs & PublicationsSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/cheat_sheet.html b/cpu/2.5.0+cpu/tutorials/cheat_sheet.html index 3551797d4..24b2ee3bf 100644 --- a/cpu/2.5.0+cpu/tutorials/cheat_sheet.html +++ b/cpu/2.5.0+cpu/tutorials/cheat_sheet.html @@ -195,7 +195,7 @@

Cheat SheetSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/contribution.html b/cpu/2.5.0+cpu/tutorials/contribution.html index 9c4077718..9c1b5789e 100644 --- a/cpu/2.5.0+cpu/tutorials/contribution.html +++ b/cpu/2.5.0+cpu/tutorials/contribution.html @@ -331,7 +331,7 @@

Tips< Built with Sphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/examples.html b/cpu/2.5.0+cpu/tutorials/examples.html index cf225776b..8f595e425 100644 --- a/cpu/2.5.0+cpu/tutorials/examples.html +++ b/cpu/2.5.0+cpu/tutorials/examples.html @@ -636,7 +636,7 @@

Fast Bert (Prototype)
import torch
 from transformers import BertModel
 
-model = BertModel.from_pretrained("bert-base-uncased")
+model = BertModel.from_pretrained("bert-base-uncased", attn_implementation="eager")
 model.eval()
 
 vocab_size = model.config.vocab_size
@@ -1455,7 +1455,7 @@ 

Intel® AI Reference ModelsSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/features.html b/cpu/2.5.0+cpu/tutorials/features.html index 95d044081..ba89bfe2c 100644 --- a/cpu/2.5.0+cpu/tutorials/features.html +++ b/cpu/2.5.0+cpu/tutorials/features.html @@ -440,7 +440,7 @@

Fast BERT Optimization (Prototype, NEW feature from 2.0.0)Sphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/features/amp.html b/cpu/2.5.0+cpu/tutorials/features/amp.html index 07fd70fd3..6328a3f82 100644 --- a/cpu/2.5.0+cpu/tutorials/features/amp.html +++ b/cpu/2.5.0+cpu/tutorials/features/amp.html @@ -262,7 +262,7 @@

Ops that promote to the widest input typeSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/features/auto_channels_last.html b/cpu/2.5.0+cpu/tutorials/features/auto_channels_last.html index de335adfc..915851e3b 100644 --- a/cpu/2.5.0+cpu/tutorials/features/auto_channels_last.html +++ b/cpu/2.5.0+cpu/tutorials/features/auto_channels_last.html @@ -192,7 +192,7 @@

Known issueSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/features/codeless_optimization.html b/cpu/2.5.0+cpu/tutorials/features/codeless_optimization.html index fac1c5810..22c6f3bf8 100644 --- a/cpu/2.5.0+cpu/tutorials/features/codeless_optimization.html +++ b/cpu/2.5.0+cpu/tutorials/features/codeless_optimization.html @@ -280,7 +280,7 @@

Already using Jit TraceSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/features/fast_bert.html b/cpu/2.5.0+cpu/tutorials/features/fast_bert.html index e90c4339d..763e6afa2 100644 --- a/cpu/2.5.0+cpu/tutorials/features/fast_bert.html +++ b/cpu/2.5.0+cpu/tutorials/features/fast_bert.html @@ -144,7 +144,7 @@

Feature Description

Prerequisite

    -
  • Transformers 4.6.0 ~ 4.43.2

  • +
  • Transformers 4.6.0 ~ 4.45.0

@@ -153,7 +153,7 @@

Usage Example
import torch
 from transformers import BertModel
 
-model = BertModel.from_pretrained("bert-base-uncased")
+model = BertModel.from_pretrained("bert-base-uncased", attn_implementation="eager")
 model.eval()
 
 vocab_size = model.config.vocab_size
@@ -194,7 +194,7 @@ 

Usage ExampleSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/features/graph_capture.html b/cpu/2.5.0+cpu/tutorials/features/graph_capture.html index 06ddd1bfe..504b9cb2d 100644 --- a/cpu/2.5.0+cpu/tutorials/features/graph_capture.html +++ b/cpu/2.5.0+cpu/tutorials/features/graph_capture.html @@ -180,7 +180,7 @@

Usage ExampleSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/features/graph_optimization.html b/cpu/2.5.0+cpu/tutorials/features/graph_optimization.html index 240c613fe..dc20c83af 100644 --- a/cpu/2.5.0+cpu/tutorials/features/graph_optimization.html +++ b/cpu/2.5.0+cpu/tutorials/features/graph_optimization.html @@ -392,7 +392,7 @@

Folding Built with Sphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/features/hypertune.html b/cpu/2.5.0+cpu/tutorials/features/hypertune.html index 03a8ec635..a2b73436b 100644 --- a/cpu/2.5.0+cpu/tutorials/features/hypertune.html +++ b/cpu/2.5.0+cpu/tutorials/features/hypertune.html @@ -330,7 +330,7 @@

Usage ExamplesSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/features/int8_overview.html b/cpu/2.5.0+cpu/tutorials/features/int8_overview.html index da219b2e2..9fb177830 100644 --- a/cpu/2.5.0+cpu/tutorials/features/int8_overview.html +++ b/cpu/2.5.0+cpu/tutorials/features/int8_overview.html @@ -300,7 +300,7 @@

Convert to Dynamic Quantized Model and DeploySphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/features/int8_recipe_tuning_api.html b/cpu/2.5.0+cpu/tutorials/features/int8_recipe_tuning_api.html index 5b47f44b1..60fbdb256 100644 --- a/cpu/2.5.0+cpu/tutorials/features/int8_recipe_tuning_api.html +++ b/cpu/2.5.0+cpu/tutorials/features/int8_recipe_tuning_api.html @@ -390,7 +390,7 @@

Determining the Sphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/features/isa_dynamic_dispatch.html b/cpu/2.5.0+cpu/tutorials/features/isa_dynamic_dispatch.html index 48e126072..e635c8da5 100644 --- a/cpu/2.5.0+cpu/tutorials/features/isa_dynamic_dispatch.html +++ b/cpu/2.5.0+cpu/tutorials/features/isa_dynamic_dispatch.html @@ -742,7 +742,7 @@

CPU feature checkSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/features/nhwc.html b/cpu/2.5.0+cpu/tutorials/features/nhwc.html index cbbff7a5f..87c5762af 100644 --- a/cpu/2.5.0+cpu/tutorials/features/nhwc.html +++ b/cpu/2.5.0+cpu/tutorials/features/nhwc.html @@ -370,7 +370,7 @@

CPU Channels Last TargetsSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/features/optimizer_fusion.html b/cpu/2.5.0+cpu/tutorials/features/optimizer_fusion.html index b16dfd160..e79a8474c 100644 --- a/cpu/2.5.0+cpu/tutorials/features/optimizer_fusion.html +++ b/cpu/2.5.0+cpu/tutorials/features/optimizer_fusion.html @@ -184,7 +184,7 @@

Operation FusionSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/features/runtime_extension.html b/cpu/2.5.0+cpu/tutorials/features/runtime_extension.html index e9666485a..ad7f47d41 100644 --- a/cpu/2.5.0+cpu/tutorials/features/runtime_extension.html +++ b/cpu/2.5.0+cpu/tutorials/features/runtime_extension.html @@ -347,7 +347,7 @@

IOMP preload or load during the runtimeSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/features/split_sgd.html b/cpu/2.5.0+cpu/tutorials/features/split_sgd.html index c2eb39803..4db0b9352 100644 --- a/cpu/2.5.0+cpu/tutorials/features/split_sgd.html +++ b/cpu/2.5.0+cpu/tutorials/features/split_sgd.html @@ -220,7 +220,7 @@

Split SGD< Built with Sphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/features/sq_recipe_tuning_api.html b/cpu/2.5.0+cpu/tutorials/features/sq_recipe_tuning_api.html index 1a88cdca5..99867857b 100644 --- a/cpu/2.5.0+cpu/tutorials/features/sq_recipe_tuning_api.html +++ b/cpu/2.5.0+cpu/tutorials/features/sq_recipe_tuning_api.html @@ -209,7 +209,7 @@

Smooth Quant Recipe Tuning API (Prototype)Sphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/getting_started.html b/cpu/2.5.0+cpu/tutorials/getting_started.html index d80e6400a..8e8e85197 100644 --- a/cpu/2.5.0+cpu/tutorials/getting_started.html +++ b/cpu/2.5.0+cpu/tutorials/getting_started.html @@ -113,7 +113,7 @@

Quick Start

-

The following instructions assume you have installed the Intel® Extension for PyTorch*. For installation instructions, refer to Installation.

+

The following instructions assume you have installed the Intel® Extension for PyTorch*. For installation instructions, refer to Installation.

To start using the Intel® Extension for PyTorch* in your code, you need to make the following changes:

  1. Import the extension with import intel_extension_for_pytorch as ipex.

  2. @@ -261,7 +261,7 @@

    LLM Quick Startprint(gen_text, total_new_tokens, flush=True)

-

More LLM examples, including usage of low precision data types are available in the LLM Examples section.

+

More LLM examples, including usage of low precision data types are available in the LLM Examples section.

@@ -282,7 +282,7 @@

LLM Quick StartSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/installation.html b/cpu/2.5.0+cpu/tutorials/installation.html index eef9efb3a..bf139131e 100644 --- a/cpu/2.5.0+cpu/tutorials/installation.html +++ b/cpu/2.5.0+cpu/tutorials/installation.html @@ -110,9 +110,9 @@

Installation

-

Select your preferences and follow the installation instructions provided on the Installation page.

+

Select your preferences and follow the installation instructions provided on the Installation page.

After successful installation, refer to the Quick Start and Examples sections to start using the extension in your code.

-

NOTE: For detailed instructions on installing and setting up the environment for Large Language Models (LLM), as well as example scripts, refer to the LLM best practices.

+

NOTE: For detailed instructions on installing and setting up the environment for Large Language Models (LLM), as well as example scripts, refer to the LLM best practices.

@@ -132,7 +132,7 @@

InstallationSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/introduction.html b/cpu/2.5.0+cpu/tutorials/introduction.html index d33171992..27607af3d 100644 --- a/cpu/2.5.0+cpu/tutorials/introduction.html +++ b/cpu/2.5.0+cpu/tutorials/introduction.html @@ -128,7 +128,7 @@

Introduction

Get Started

@@ -156,7 +156,7 @@

API DocumentationSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/known_issues.html b/cpu/2.5.0+cpu/tutorials/known_issues.html index d1c3b186f..bd15b60c6 100644 --- a/cpu/2.5.0+cpu/tutorials/known_issues.html +++ b/cpu/2.5.0+cpu/tutorials/known_issues.html @@ -316,7 +316,7 @@

Result CorrectnessSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/license.html b/cpu/2.5.0+cpu/tutorials/license.html index be0f43618..cf3a81ca8 100644 --- a/cpu/2.5.0+cpu/tutorials/license.html +++ b/cpu/2.5.0+cpu/tutorials/license.html @@ -132,7 +132,7 @@

License Built with Sphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/llm.html b/cpu/2.5.0+cpu/tutorials/llm.html index 0f18ce717..d1d88c340 100644 --- a/cpu/2.5.0+cpu/tutorials/llm.html +++ b/cpu/2.5.0+cpu/tutorials/llm.html @@ -156,9 +156,9 @@

Verified for single instance mode🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

LLAMA

@@ -183,16 +183,16 @@

Verified for single instance mode🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

LLAMA

meta-llama/Meta-Llama-3-70B

🟩

🟩

-

🟨

+

🟩

🟩

🟩

@@ -201,10 +201,28 @@

Verified for single instance mode🟩

🟩

-

🟨

+

🟩

🟩

🟩

+ +

LLAMA

+

meta-llama/Llama-3.2-3B-Instruct

+

🟩

+

🟩

+

🟩

+

🟩

+

🟩

+ + +

LLAMA

+

meta-llama/Llama-3.2-11B-Vision-Instruct

+

🟩

+

🟩

+

+

🟩

+

+

GPT-J

EleutherAI/gpt-j-6b

@@ -218,19 +236,19 @@

Verified for single instance mode🟩

-

🟨

-

🟨

🟩

-

🟨

+

🟩

+

🟩

+

🟩

DOLLY

databricks/dolly-v2-12b

🟩

-

🟨

-

🟨

🟩

-

🟨

+

🟩

+

🟩

+

🟩

FALCON

@@ -239,7 +257,7 @@

Verified for single instance mode🟩

🟩

🟩

-

+

🟩

FALCON

@@ -248,7 +266,7 @@

Verified for single instance mode🟩

🟩

🟩

-

🟨

+

🟩

FALCON

@@ -266,7 +284,7 @@

Verified for single instance mode🟩

🟩

🟩

-

🟨

+

🟩

OPT

@@ -275,16 +293,16 @@

Verified for single instance mode🟩

🟩

🟩

-

🟨

+

🟩

Bloom

bigscience/bloom-1b7

🟩

-

🟨

🟩

🟩

-

🟨

+

🟩

+

🟩

CodeGen

@@ -302,34 +320,34 @@

Verified for single instance mode🟩

🟩

🟩

-

🟨

+

🟩

Baichuan

baichuan-inc/Baichuan2-13B-Chat

🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

Baichuan

baichuan-inc/Baichuan-13B-Chat

🟩

-

🟨

🟩

🟩

-

🟨

+

🟩

+

🟩

ChatGLM

THUDM/chatglm3-6b

🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

ChatGLM

@@ -338,23 +356,23 @@

Verified for single instance mode🟩

🟩

🟩

-

🟨

+

🟩

GPTBigCode

bigcode/starcoder

🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

T5

google/flan-t5-xl

🟩

🟩

-

🟨

+

🟩

🟩

@@ -372,9 +390,9 @@

Verified for single instance mode🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

Mixtral

@@ -383,34 +401,34 @@

Verified for single instance mode🟩

🟩

-

🟨

+

🟩

Stablelm

stabilityai/stablelm-2-1_6b

🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

Qwen

Qwen/Qwen-7B-Chat

🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

Qwen

Qwen/Qwen2-7B

🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

LLaVA

@@ -436,7 +454,7 @@

Verified for single instance mode🟩

🟩

-

🟨

+

🟩

@@ -446,43 +464,43 @@

Verified for single instance mode🟩

🟩

🟩

-

🟨

+

🟩

Phi

microsoft/Phi-3-mini-4k-instruct

🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

Phi

microsoft/Phi-3-mini-128k-instruct

🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

Phi

microsoft/Phi-3-medium-4k-instruct

🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

Phi

microsoft/Phi-3-medium-128k-instruct

🟩

🟩

-

🟨

🟩

-

🟨

+

🟩

+

🟩

Whisper

@@ -494,11 +512,7 @@

Verified for single instance mode

- -
    -
  • 🟩 signifies that the model can perform well and with good accuracy (<1% difference as compared with FP32).

  • -
  • 🟨 signifies that the model can perform well while accuracy may not be in a perfect state (>1% difference as compared with FP32).

  • -
+

Verified for distributed inference mode via DeepSpeed

@@ -547,6 +561,18 @@

Verified for distributed inference mode via DeepSpeed

🟩

+ + + + + + + + + + + + @@ -556,13 +582,13 @@

Verified for distributed inference mode via DeepSpeed

- + - + @@ -580,7 +606,7 @@

Verified for distributed inference mode via DeepSpeed

- + @@ -592,7 +618,7 @@

Verified for distributed inference mode via DeepSpeed

- + @@ -616,7 +642,7 @@

Verified for distributed inference mode via DeepSpeed

- + @@ -710,11 +736,7 @@

Verified for distributed inference mode via DeepSpeed

🟩

-

🟩

LLAMA

meta-llama/Llama-3.2-3B-Instruct

🟩

🟩

LLAMA

meta-llama/Llama-3.2-11B-Vision-Instruct

🟩

🟩

GPT-J

EleutherAI/gpt-j-6b

GPT-NEOX

EleutherAI/gpt-neox-20b

🟨

🟩

🟩

DOLLY

databricks/dolly-v2-12b

🟨

🟩

🟩

OPT

facebook/opt-30b

🟨

🟩

🟩

Bloom

bigscience/bloom-1b7

🟨

🟩

🟩

Baichuan

baichuan-inc/Baichuan-13B-Chat

🟨

🟩

🟩

-
    -
  • 🟩 signifies that the model can perform well and with good accuracy (<1% difference as compared with FP32).

  • -
  • 🟨 signifies that the model can perform well while accuracy may not been in a perfect state (>1% difference as compared with FP32).

  • -

Note: The above verified models (including other models in the same model family, like “codellama/CodeLlama-7b-hf” from LLAMA family) are well supported with all optimizations like indirect access KV cache, fused ROPE, and customized linear kernels. We are working to better support the models in the tables with various data types. In addition, more models will be optimized in the future.

+

Note: The above verified models (including other models in the same model family, like “codellama/CodeLlama-7b-hf” from LLAMA family) are well supported with all optimizations like indirect access KV cache, fused ROPE, and customized linear kernels. We are working to better support the models in the tables with various data types. In addition, more models will be optimized in the future.

Please check LLM best known practice for instructions to install/setup environment and example scripts.

@@ -853,7 +875,7 @@

Distributed InferenceSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (0BSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/llm/llm_optimize.html b/cpu/2.5.0+cpu/tutorials/llm/llm_optimize.html index 67a840168..3f7e17c55 100644 --- a/cpu/2.5.0+cpu/tutorials/llm/llm_optimize.html +++ b/cpu/2.5.0+cpu/tutorials/llm/llm_optimize.html @@ -271,7 +271,7 @@

Distributed Inference with DeepSpeedSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (0BSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/performance.html b/cpu/2.5.0+cpu/tutorials/performance.html index 3035ceb5a..0ed45be76 100644 --- a/cpu/2.5.0+cpu/tutorials/performance.html +++ b/cpu/2.5.0+cpu/tutorials/performance.html @@ -1038,7 +1038,7 @@

Hardware ConfigurationSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (0BSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/performance_tuning/launch_script.html b/cpu/2.5.0+cpu/tutorials/performance_tuning/launch_script.html index 60c029aa6..c4f061b05 100644 --- a/cpu/2.5.0+cpu/tutorials/performance_tuning/launch_script.html +++ b/cpu/2.5.0+cpu/tutorials/performance_tuning/launch_script.html @@ -835,7 +835,7 @@

GNU OpenMP LibrarySphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (0BSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/performance_tuning/torchserve.html b/cpu/2.5.0+cpu/tutorials/performance_tuning/torchserve.html index 3bd16b73c..d21bff958 100644 --- a/cpu/2.5.0+cpu/tutorials/performance_tuning/torchserve.html +++ b/cpu/2.5.0+cpu/tutorials/performance_tuning/torchserve.html @@ -462,7 +462,7 @@

Performance Boost with Intel® Extension for PyTorch* and LauncherSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (0BSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/performance_tuning/tuning_guide.html b/cpu/2.5.0+cpu/tutorials/performance_tuning/tuning_guide.html index 1c2ef6a06..cac1039fc 100644 --- a/cpu/2.5.0+cpu/tutorials/performance_tuning/tuning_guide.html +++ b/cpu/2.5.0+cpu/tutorials/performance_tuning/tuning_guide.html @@ -366,7 +366,7 @@

OneDNN primitive cacheSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (0BSD), http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/releases.html b/cpu/2.5.0+cpu/tutorials/releases.html index 16fe282ca..8759d1240 100644 --- a/cpu/2.5.0+cpu/tutorials/releases.html +++ b/cpu/2.5.0+cpu/tutorials/releases.html @@ -58,101 +58,105 @@
  • Large Language Models (LLM)
  • Performance
  • Releases
      -
    • 2.4.0
        +
      • 2.5.0
      • -
      • 2.3.100
          +
        • 2.4.0
        • -
        • 2.3.0
            +
          • 2.3.100
          • -
          • 2.2.0
              +
            • 2.3.0
            • -
            • 2.1.100
                +
              • 2.2.0
              • -
              • 2.1.0
                  +
                • 2.1.100
                • -
                • 2.0.100
                    +
                  • 2.1.0
                  • -
                  • 2.0.0
                      +
                    • 2.0.100
                    • -
                    • 1.13.100
                        +
                      • 2.0.0
                      • -
                      • 1.13.0
                          +
                        • 1.13.100
                        • -
                        • 1.12.300
                            -
                          • Highlights
                          • +
                          • 1.13.0
                          • -
                          • 1.12.100
                          • -
                          • 1.12.0
                              -
                            • Highlights
                            • -
                            • Known Issues
                            • +
                            • 1.12.300
                            • -
                            • 1.11.200
                                -
                              • Highlights
                              • +
                              • 1.12.100
                              • +
                              • 1.12.0
                              • -
                              • 1.11.0
                                  +
                                • 1.11.200 +
                                • +
                                • 1.11.0
                                • -
                                • 1.10.100
                                • -
                                • 1.10.0
                                    -
                                  • Highlights
                                  • -
                                  • Known Issues
                                  • -
                                  • What’s Changed
                                  • +
                                  • 1.10.100
                                  • +
                                  • 1.10.0
                                  • -
                                  • 1.9.0
                                      +
                                    • 1.9.0
                                    • -
                                    • 1.8.0
                                        -
                                      • What’s New
                                      • +
                                      • 1.8.0
                                      • -
                                      • 1.2.0
                                          -
                                        • What’s New
                                        • +
                                        • 1.2.0
                                        • -
                                        • 1.1.0
                                            -
                                          • What’s New
                                          • +
                                          • 1.1.0
                                          • -
                                          • 1.0.2
                                          • +
                                          • 1.0.2
                                          • 1.0.1-Alpha
                                          • -
                                          • 1.0.0-Alpha
                                              -
                                            • What’s New
                                            • +
                                            • 1.0.0-Alpha
                                            • @@ -211,11 +215,40 @@

                                              Releases

                                              -

                                              2.4.0

                                              -

                                              We are excited to announce the release of Intel® Extension for PyTorch* 2.4.0+cpu which accompanies PyTorch 2.4. This release mainly brings you the support for Llama3.1, basic support for LLM serving frameworks like vLLM/TGI, and a set of optimization to push better performance for LLM models. This release also extends the list of optimized LLM models to a broader level and includes a set of bug fixing and small optimizations. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try this release and feedback as to improve further on this product.

                                              +

                                              2.5.0

                                              +

                                              We are excited to announce the release of Intel® Extension for PyTorch* 2.5.0+cpu, which accompanies PyTorch 2.5. This release mainly brings you support for Llama 3.2, optimizations for the newly launched Intel® Xeon® 6 P-core platform, GPTQ/AWQ format support, and the latest optimizations to push better performance for LLMs. This release also includes a set of bug fixes and small optimizations. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try this release and provide feedback so that we can further improve this product.

                                              Highlights

                                                +
                                              • Llama 3.2 support

                                              • +
                                              +

                                              Meta has newly released Llama 3.2, which includes small and medium-sized vision LLMs (11B and 90B), and lightweight, text-only models (1B and 3B). Intel® Extension for PyTorch* has supported Llama 3.2 since its launch date through an early release version, and now supports it with this official release.

                                              +
                                                +
                                              • Optimization for Intel® Xeon® 6 +Intel® Xeon® 6 delivers new degrees of performance with more cores, a choice of microarchitecture, additional memory bandwidth, and exceptional input/output (I/O) across a range of workloads. Intel® Extension for PyTorch* provides dedicated optimizations on this new processor family for features like Multiplexed Rank DIMM (MRDIMM), the SNC=3 scenario, etc.

                                              • +
                                              • Large Language Model (LLM) optimization: +Intel® Extension for PyTorch* provides more feature support for weight-only quantization, including GPTQ/AWQ format support and symmetric quantization of activation and weight, and adds chunked prefill/prefix prefill support in the LLM module API, among others. These features enable better adoption of community model weights and provide better performance for low-precision scenarios. This release also extends the optimized models to include the newly published Llama 3.2 vision models. A full list of optimized models can be found at LLM optimization.

                                              • +
                                              • Bug fixing and other optimization

                                                +
                                                  +
                                                • Optimized the performance of the IndirectAccessKVCacheAttention kernel +#3185 #3209 #3214 #3218 #3248

                                                • +
                                                • Fixed the Segmentation fault in the IndirectAccessKVCacheAttention kernel #3246

                                                • +
                                                • Fixed the correctness issue in the PagedAttention kernel for Llama-68M-Chat-v1 #3307

                                                • +
                                                • Fixed the support in ipex.llm.optimize to ensure model.generate returns the correct output type when return_dict_in_generate is set to True. #3333

                                                • +
                                                • Optimized the performance of the Flash Attention kernel #3291

                                                • +
                                                • Upgraded oneDNN to v3.6 #3305

                                                • +
                                                +
                                              • +
                                              +

                                              Full Changelog: https://github.com/intel/intel-extension-for-pytorch/compare/v2.4.0+cpu...v2.5.0+cpu

                                              +
                                              +
                                              +
                                              +

                                              2.4.0

                                              +

                                              We are excited to announce the release of Intel® Extension for PyTorch* 2.4.0+cpu which accompanies PyTorch 2.4. This release mainly brings you the support for Llama3.1, basic support for LLM serving frameworks like vLLM/TGI, and a set of optimization to push better performance for LLM models. This release also extends the list of optimized LLM models to a broader level and includes a set of bug fixing and small optimizations. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try this release and feedback as to improve further on this product.

                                              +
                                              +

                                              Highlights

                                              +
                                              • Llama 3.1 support

                                              Meta has newly released Llama 3.1 with new features like longer context length (128K) support. Intel® Extension for PyTorch* has supported Llama 3.1 since its launch date through an early release version, and now supports it with this official release.

                                              @@ -241,10 +274,10 @@

                                              Highlights -

                                              2.3.100

                                              -
                                              -

                                              Highlights

                                              +
                                              +

                                              2.3.100

                                              +
                                              +

                                              Highlights

                                              -
                                              -

                                              2.3.0

                                              +
                                              +

                                              2.3.0

                                              We are excited to announce the release of Intel® Extension for PyTorch* 2.3.0+cpu which accompanies PyTorch 2.3. This release mainly brings you the new feature on Large Language Model (LLM) called module level LLM optimization API, which provides module level optimizations for commonly used LLM modules and functionalities, and targets to optimize customized LLM modeling for scenarios like private models, self-customized models, LLM serving frameworks, etc. This release also extends the list of optimized LLM models to a broader level and includes a set of bug fixing and small optimizations. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try this release and feedback as to improve further on this product.

                                              -
                                              -

                                              Highlights

                                              +
                                              +

                                              Highlights

                                              -
                                              -

                                              2.2.0

                                              +
                                              +

                                              2.2.0

                                              We are excited to announce the release of Intel® Extension for PyTorch* 2.2.0+cpu which accompanies PyTorch 2.2. This release mainly brings in our latest optimization on Large Language Model (LLM) including new dedicated API set (ipex.llm), new capability for auto-tuning accuracy recipe for LLM, and a broader list of optimized LLM models, together with a set of bug fixing and small optimization. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try this release and feedback as to improve further on this product.

                                              -
                                              -

                                              Highlights

                                              +
                                              +

                                              Highlights

                                              • Large Language Model (LLM) optimization

                                                Intel® Extension for PyTorch* provides a new dedicated module, ipex.llm, to host APIs specific to Large Language Models (LLMs). With ipex.llm, Intel® Extension for PyTorch* provides comprehensive LLM optimization across various popular datatypes including FP32/BF16/INT8/INT4. Specifically for low precision, both SmoothQuant and Weight-Only quantization are supported for various scenarios. Users can also run Intel® Extension for PyTorch* with Tensor Parallel to fit multi-rank or multi-node scenarios and get even better performance.

                                                @@ -362,10 +395,10 @@

                                                Highlights

                                                Full Changelog: https://github.com/intel/intel-extension-for-pytorch/compare/v2.1.100+cpu...v2.2.0+cpu

                                              -
                                              -

                                              2.1.100

                                              -
                                              -

                                              Highlights

                                              +
                                              +

                                              2.1.100

                                              +
                                              +

                                              Highlights

                                              -
                                              -

                                              2.1.0

                                              -
                                              -

                                              Highlights

                                              +
                                              +

                                              2.1.0

                                              +
                                              +

                                              Highlights

                                              • Large Language Model (LLM) optimization (Experimental): Intel® Extension for PyTorch* provides a lot of specific optimizations for LLMs in this new release. At the operator level, we provide a highly efficient GEMM kernel to speed up the Linear layer and customized operators to reduce the memory footprint. To better trade off performance and accuracy, different low-precision solutions, e.g., SmoothQuant for INT8 and weight-only quantization for INT4 and INT8, are also enabled. Besides, tensor parallel can also be adopted to get lower latency for LLMs.

                                                A new API function, ipex.optimize_transformers, is designed to optimize transformer-based models within frontend Python modules, with a particular focus on Large Language Models (LLMs). It provides optimizations for both model-wise and content-generation-wise. You just need to invoke the ipex.optimize_transformers function instead of the ipex.optimize function to apply all optimizations transparently. More detailed information can be found at Large Language Model optimizations overview.

                                                @@ -407,10 +440,10 @@

                                                Highlights

                                              -
                                              -

                                              2.0.100

                                              -
                                              -

                                              Highlights

                                              +
                                              +

                                              2.0.100

                                              +
                                              +

                                              Highlights

                                              -
                                              -

                                              2.0.0

                                              +
                                              +

                                              2.0.0

                                              We are pleased to announce the release of Intel® Extension for PyTorch* 2.0.0-cpu, which accompanies PyTorch 2.0. This release mainly brings in our latest optimization on NLP (BERT), support for PyTorch 2.0’s hero API, torch.compile, as one of its backends, together with a set of bug fixes and small optimizations.

                                              -
                                              -

                                              Highlights

                                              +
                                              +

                                              Highlights

                                              • Fast BERT optimization (Experimental): Intel introduced a new technique to speed up BERT workloads. Intel® Extension for PyTorch* integrated this implementation, which benefits BERT model especially training. A new API ipex.fast_bert is provided to try this new optimization. More detailed information can be found at Fast Bert Feature.

                                              • MHA optimization with Flash Attention: Intel optimized MHA module with Flash Attention technique as inspired by Stanford paper. This brings less memory consumption for LLM, and also provides better inference performance for models like BERT, Stable Diffusion, etc.

                                              • @@ -461,10 +494,10 @@

                                                Known IssuesKnown Issues webpage.

                                              -
                                              -

                                              1.13.100

                                              -
                                              -

                                              Highlights

                                              +
                                              +

                                              1.13.100

                                              +
                                              +

                                              Highlights

                                              -
                                              -

                                              1.13.0

                                              +
                                              +

                                              1.13.0

                                              We are pleased to announce the release of Intel® Extension for PyTorch* 1.13.0-cpu which accompanies PyTorch 1.13. This release is highlighted with quite a few usability features which help users to get good performance and accuracy on CPU with less effort. We also added a couple of performance features as always. Check out the feature summary below.

                                              • Usability Features

                                              • @@ -495,8 +528,8 @@

                                                1.13.0

                                                Packed MKL SGEMM landed as the default kernel option for FP32 Linear, bringing up-to 20% geomean speedup for real-time NLP tasks.

                                              • DL compiler is now turned on by default with oneDNN fusion and gives additional performance boost for INT8 models.

                                              • -
                                                -

                                                Highlights

                                                +
                                                +

                                                Highlights

                                                • Automatic channels last format conversion: Channels last conversion is now applied to PyTorch modules automatically with ipex.optimize by default for both training and inference scenarios. Users don’t have to explicitly convert input and weight for CV models.

                                                  import intel_extension_for_pytorch as ipex
                                                  @@ -571,15 +604,15 @@ 

                                                  Highlights

                                                -
                                                -

                                                Known Issues

                                                +
                                                +

                                                Known Issues

                                                Please check at Known Issues webpage.

                                                -
                                                -

                                                1.12.300

                                                -
                                                -

                                                Highlights

                                                +
                                                +

                                                1.12.300

                                                +
                                                +

                                                Highlights

                                                -
                                                -

                                                1.12.100

                                                +
                                                +

                                                1.12.100

                                                This is a patch release to fix the AVX2 issue that blocks running on non-AVX512 platforms.

                                                -
                                                -

                                                1.12.0

                                                +
                                                +

                                                1.12.0

                                                We are excited to bring you the release of Intel® Extension for PyTorch* 1.12.0-cpu, by tightly following PyTorch 1.12 release. In this release, we matured the automatic int8 quantization and made it a stable feature. We stabilized runtime extension and brought about a MultiStreamModule feature to further boost throughput in offline inference scenario. We also brought about various enhancements in operation and graph which are positive for performance of broad set of workloads.

                                                Highlights include:

                                                  @@ -603,8 +636,8 @@

                                                  1.12.0

                                                  More optimizations in graph and operations to improve performance of a broad set of models; examples include but are not limited to wav2vec, T5, Albert, etc.

                                                • Pre-built experimental binary with oneDNN Graph Compiler turned on would deliver additional performance gain for Bert, Albert, Roberta in INT8 inference.

                                                -
                                                -

                                                Highlights

                                                +
                                                +

                                                Highlights

                                                • Matured automatic INT8 quantization feature baking into a well-tuned default quantization recipe. We facilitated the user experience and provided a wide range of calibration algorithms like Histogram, MinMax, MovingAverageMinMax, etc. Meanwhile, we polished the static quantization with better flexibility and enabled dynamic quantization as well. Compared to the previous version, the brief changes are as follows. Refer to the tutorial page for more details.

                                                  @@ -730,8 +763,8 @@

                                                  Highlights -
                                                  -

                                                  Known Issues

                                                  +
                                                  +

                                                  Known Issues

                                                  • RuntimeError: Overflow when unpacking long when a tensor’s min max value exceeds int range while performing int8 calibration. Please customize QConfig to use min-max calibration method.

                                                  • Calibrating with quantize_per_tensor, when benchmarking with 1 OpenMP* thread, results might be incorrect with large tensors (find more detailed info here). Editing your code following the pseudocode below can work around this issue, if you do need to explicitly set OMP_NUM_THREADS=1 for benchmarking. However, there could be a performance regression if the oneDNN graph compiler prototype feature is utilized.

                                                    @@ -791,10 +824,10 @@

                                                    Known Issues

                                                  -
                                                  -

                                                  1.11.200

                                                  -
                                                  -

                                                  Highlights

                                                  +
                                                  +

                                                  1.11.200

                                                  +
                                                  +

                                                  Highlights

                                                  • Enable more fused operators to accelerate particular models.

                                                  • Fuse Convolution and LeakyReLU (#648)

                                                  • @@ -807,8 +840,8 @@

                                                    Highlights

                                                    Full Changelog

                                                  -
                                                  -

                                                  1.11.0

                                                  +
                                                  +

                                                  1.11.0

                                                  We are excited to announce Intel® Extension for PyTorch* 1.11.0-cpu release by tightly following PyTorch 1.11 release. Along with extension 1.11, we focused on continually improving OOB user experience and performance. Highlights include:

                                                  • Support a single binary with runtime dynamic dispatch based on AVX2/AVX512 hardware ISA detection

                                                  • @@ -817,8 +850,8 @@

                                                    1.11.0

                                                    Add more optimizations, including graph fusions for speeding up Transformer-based models and CNN, etc

                                                  • Reduce the binary size for both the PIP wheel and C++ SDK (2X to 5X reduction from the previous version)

                                                  -
                                                  -

                                                  Highlights

                                                  +
                                                  +

                                                  Highlights

                                                  • Combine the AVX2 and AVX512 binary as a single binary and automatically dispatch to different implementations based on hardware ISA detection at runtime. The typical case is to serve a data center that mixes AVX2-only and AVX512 platforms. Unlike the previous version, there is no longer a need to deploy a different binary per ISA.

                                                    NOTE: The extension uses the oneDNN library as the backend. However, the BF16 and INT8 operator sets and features are different between AVX2 and AVX512. Refer to oneDNN document for more details.

                                                    @@ -887,8 +920,8 @@

                                                    What’s Changed -

                                                    1.10.100

                                                    +
                                                    +

                                                    1.10.100

                                                    This release is meant to fix the following issues:

                                                    • Resolve the issue that the PyTorch Tensor Expression(TE) did not work after importing the extension.

                                                    • @@ -901,12 +934,12 @@

                                                      1.10.100<

                                                    -
                                                    -

                                                    1.10.0

                                                    +
                                                    +

                                                    1.10.0

                                                    The Intel® Extension for PyTorch* 1.10 is on top of PyTorch 1.10. In this release, we polished the front end APIs. The APIs are simpler, more stable, and more straightforward now. According to PyTorch community recommendation, we changed the underlying device from XPU to CPU. With this change, the model and tensor do not need to be converted to the extension device to get performance improvement. It simplifies the model changes.

                                                    Besides that, we continuously optimize the Transformer* and CNN models by fusing more operators and applying NHWC. We measured the 1.10 performance on Torchvision and Hugging Face. As expected, 1.10 can speed up the two model zoos.

                                                    -
                                                    -

                                                    Highlights

                                                    +
                                                    +

                                                    Highlights

                                                    • Change the package name to intel_extension_for_pytorch while the original package name is intel_pytorch_extension. This change targets to avoid any potential legal issues.

                                                    @@ -1120,8 +1153,8 @@

                                                    Highlights
                                                  • Runtime Extension (Experimental) provides a runtime CPU pool API to bind threads to cores. It also features async tasks. Note: Intel® Extension for PyTorch* Runtime extension is still in the experimental stage. The API is subject to change. More detailed descriptions are available in the extension documentation.

                                                  -
                                                  -

                                                  Known Issues

                                                  +
                                                  +

                                                  Known Issues

                                                  • omp_set_num_threads function failed to change OpenMP threads number of oneDNN operators if it was set before.

                                                    omp_set_num_threads function is provided in Intel® Extension for PyTorch* to change the number of threads used with OpenMP. However, it failed to change the number of OpenMP threads if it was set before.

                                                    @@ -1142,13 +1175,13 @@

                                                    Known Issues

                                                  -
                                                  -

                                                  What’s Changed

                                                  +
                                                  +

                                                  What’s Changed

                                                  Full Changelog: https://github.com/intel/intel-extension-for-pytorch/compare/v1.9.0…v1.10.0+cpu-rc3

                                                  -
                                                  -

                                                  1.9.0

                                                  +
                                                  +

                                                  1.9.0

                                                  What’s New

                                                    @@ -1160,10 +1193,10 @@

                                                    What’s New -

                                                    1.8.0

                                                    -
                                                    -

                                                    What’s New

                                                    +
                                                    +

                                                    1.8.0

                                                    +
                                                    +

                                                    What’s New

                                                    • Rebased the Intel Extension for PyTorch from PyTorch-1.7.0 to the official PyTorch-1.8.0 release. The new XPU device type has been added into PyTorch-1.8.0 (49786), so there is no need to patch PyTorch to enable Intel Extension for PyTorch anymore.

                                                    • Upgraded the oneDNN from v1.5-rc to v1.8.1

                                                    • @@ -1171,10 +1204,10 @@

                                                      What’s New

                                                    -
                                                    -

                                                    1.2.0

                                                    -
                                                    -

                                                    What’s New

                                                    +
                                                    +

                                                    1.2.0

                                                    +
                                                    +

                                                    What’s New

                                                    • We rebased the Intel Extension for PyTorch from PyTorch-1.5rc3 to the official PyTorch-1.7.0 release. It will have performance improvement with the new PyTorch-1.7 support.

                                                    • Device name was changed from DPCPP to XPU.

                                                      @@ -1211,17 +1244,17 @@

                                                      Others<

                                                    -
                                                    -

                                                    Known issues

                                                    +
                                                    +

                                                    Known issues

                                                    • Multi-node training still encounters hang issues after several iterations. The fix will be included in the next official release.

                                                    -
                                                    -

                                                    1.1.0

                                                    -
                                                    -

                                                    What’s New

                                                    +
                                                    +

                                                    1.1.0

                                                    +
                                                    +

                                                    What’s New

                                                    -
                                                    -

                                                    1.0.0-Alpha

                                                    -
                                                    -

                                                    What’s New

                                                    +
                                                    +

                                                    1.0.0-Alpha

                                                    +
                                                    +

                                                    What’s New

                                                    • Auto Operator Optimization

                                                      Intel Extension for PyTorch will automatically optimize the operators of PyTorch when importing its python package. It will significantly improve the computation performance if the input tensor and the model are converted to the extension device.

                                                      @@ -1359,8 +1392,8 @@

                                                      Performance Result -

                                                      Known issue

                                                      +
                                                      +

                                                      Known issue

                                                      • #10 All data types have not been registered for DPCPP

                                                      • #37 MaxPool can’t get nan result when input’s value is nan

                                                      • @@ -1390,7 +1423,7 @@

                                                        NOTE< Built with Sphinx using a theme provided by Read the Docs. - +

                                                        © Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.