From ecb3791b354a1bf4b0479fe642e75d4cf09b79ae Mon Sep 17 00:00:00 2001
From: Jing Xu
Date: Tue, 5 Nov 2024 12:48:23 +0900
Subject: [PATCH] add release notes to 2.5.0 (#3360)
---
.../_sources/tutorials/examples.md.txt | 2 +-
.../tutorials/features/fast_bert.md.txt | 4 +-
.../_sources/tutorials/getting_started.md.txt | 5 +-
.../_sources/tutorials/installation.md.txt | 4 +-
.../_sources/tutorials/introduction.rst.txt | 2 +-
.../_sources/tutorials/releases.md.txt | 27 ++
.../_static/htmls/tbl_deepspeed.html | 28 +-
cpu/2.5.0+cpu/_static/htmls/tbl_single.html | 120 +++----
cpu/2.5.0+cpu/design_doc/cpu/isa_dyndisp.html | 2 +-
cpu/2.5.0+cpu/genindex.html | 2 +-
cpu/2.5.0+cpu/index.html | 2 +-
cpu/2.5.0+cpu/py-modindex.html | 2 +-
cpu/2.5.0+cpu/search.html | 2 +-
cpu/2.5.0+cpu/searchindex.js | 2 +-
cpu/2.5.0+cpu/tutorials/api_doc.html | 2 +-
.../tutorials/blogs_publications.html | 2 +-
cpu/2.5.0+cpu/tutorials/cheat_sheet.html | 2 +-
cpu/2.5.0+cpu/tutorials/contribution.html | 2 +-
cpu/2.5.0+cpu/tutorials/examples.html | 4 +-
cpu/2.5.0+cpu/tutorials/features.html | 2 +-
cpu/2.5.0+cpu/tutorials/features/amp.html | 2 +-
.../features/auto_channels_last.html | 2 +-
.../features/codeless_optimization.html | 2 +-
.../tutorials/features/fast_bert.html | 6 +-
.../tutorials/features/graph_capture.html | 2 +-
.../features/graph_optimization.html | 2 +-
.../tutorials/features/hypertune.html | 2 +-
.../tutorials/features/int8_overview.html | 2 +-
.../features/int8_recipe_tuning_api.html | 2 +-
.../features/isa_dynamic_dispatch.html | 2 +-
cpu/2.5.0+cpu/tutorials/features/nhwc.html | 2 +-
.../tutorials/features/optimizer_fusion.html | 2 +-
.../tutorials/features/runtime_extension.html | 2 +-
.../tutorials/features/split_sgd.html | 2 +-
.../features/sq_recipe_tuning_api.html | 2 +-
cpu/2.5.0+cpu/tutorials/getting_started.html | 6 +-
cpu/2.5.0+cpu/tutorials/installation.html | 6 +-
cpu/2.5.0+cpu/tutorials/introduction.html | 4 +-
cpu/2.5.0+cpu/tutorials/known_issues.html | 2 +-
cpu/2.5.0+cpu/tutorials/license.html | 2 +-
cpu/2.5.0+cpu/tutorials/llm.html | 150 +++++----
cpu/2.5.0+cpu/tutorials/llm/llm_optimize.html | 2 +-
cpu/2.5.0+cpu/tutorials/performance.html | 2 +-
.../performance_tuning/launch_script.html | 2 +-
.../performance_tuning/torchserve.html | 2 +-
.../performance_tuning/tuning_guide.html | 2 +-
cpu/2.5.0+cpu/tutorials/releases.html | 299 ++++++++++--------
47 files changed, 417 insertions(+), 314 deletions(-)
diff --git a/cpu/2.5.0+cpu/_sources/tutorials/examples.md.txt b/cpu/2.5.0+cpu/_sources/tutorials/examples.md.txt
index 27a737e86..809d9046d 100644
--- a/cpu/2.5.0+cpu/_sources/tutorials/examples.md.txt
+++ b/cpu/2.5.0+cpu/_sources/tutorials/examples.md.txt
@@ -502,7 +502,7 @@ print("Execution finished")
import torch
from transformers import BertModel
-model = BertModel.from_pretrained("bert-base-uncased")
+model = BertModel.from_pretrained("bert-base-uncased", attn_implementation="eager")
model.eval()
vocab_size = model.config.vocab_size
diff --git a/cpu/2.5.0+cpu/_sources/tutorials/features/fast_bert.md.txt b/cpu/2.5.0+cpu/_sources/tutorials/features/fast_bert.md.txt
index d0621131f..848fd307d 100644
--- a/cpu/2.5.0+cpu/_sources/tutorials/features/fast_bert.md.txt
+++ b/cpu/2.5.0+cpu/_sources/tutorials/features/fast_bert.md.txt
@@ -9,7 +9,7 @@ Currently `ipex.fast_bert` API is only well optimized for training. For inferenc
### Prerequisite
-- Transformers 4.6.0 ~ 4.43.2
+- Transformers 4.6.0 ~ 4.45.0
### Usage Example
@@ -20,7 +20,7 @@ An API `ipex.fast_bert` is provided for a simple usage. Usage of this API follow
import torch
from transformers import BertModel
-model = BertModel.from_pretrained("bert-base-uncased")
+model = BertModel.from_pretrained("bert-base-uncased", attn_implementation="eager")
model.eval()
vocab_size = model.config.vocab_size
diff --git a/cpu/2.5.0+cpu/_sources/tutorials/getting_started.md.txt b/cpu/2.5.0+cpu/_sources/tutorials/getting_started.md.txt
index 67874f6d4..ee11aabc7 100644
--- a/cpu/2.5.0+cpu/_sources/tutorials/getting_started.md.txt
+++ b/cpu/2.5.0+cpu/_sources/tutorials/getting_started.md.txt
@@ -1,6 +1,6 @@
# Quick Start
-The following instructions assume you have installed the Intel® Extension for PyTorch\*. For installation instructions, refer to [Installation](../../../index.html#installation?platform=cpu&version=main).
+The following instructions assume you have installed the Intel® Extension for PyTorch\*. For installation instructions, refer to [Installation](../../../index.html#installation?platform=cpu&version=v2.5.0%2Bcpu).
To start using the Intel® Extension for PyTorch\* in your code, you need to make the following changes:
@@ -64,7 +64,6 @@ In [Cheat Sheet](./cheat_sheet.md), you can find more commands that can help you
`ipex.llm.optimize` is used for Large Language Models (LLM).
-
```python
import torch
#################### code changes ####################
@@ -157,4 +156,4 @@ with torch.inference_mode(), torch.cpu.amp.autocast(enabled=amp_enabled):
print(gen_text, total_new_tokens, flush=True)
```
-More LLM examples, including usage of low precision data types are available in the [LLM Examples](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/llm) section.
+More LLM examples, including usage of low-precision data types, are available in the [LLM Examples](https://github.com/intel/intel-extension-for-pytorch/tree/release/2.5/examples/cpu/llm) section.
diff --git a/cpu/2.5.0+cpu/_sources/tutorials/installation.md.txt b/cpu/2.5.0+cpu/_sources/tutorials/installation.md.txt
index 707a091db..567dd2d38 100644
--- a/cpu/2.5.0+cpu/_sources/tutorials/installation.md.txt
+++ b/cpu/2.5.0+cpu/_sources/tutorials/installation.md.txt
@@ -1,8 +1,8 @@
Installation
============
-Select your preferences and follow the installation instructions provided on the [Installation page](../../../index.html#installation?platform=cpu&version=v2.4.0%2Bcpu).
+Select your preferences and follow the installation instructions provided on the [Installation page](../../../index.html#installation?platform=cpu&version=v2.5.0%2Bcpu).
After successful installation, refer to the [Quick Start](getting_started.md) and [Examples](examples.md) sections to start using the extension in your code.
-**NOTE:** For detailed instructions on installing and setting up the environment for Large Language Models (LLM), as well as example scripts, refer to the [LLM best practices](https://github.com/intel/intel-extension-for-pytorch/tree/v2.4.0%2Bcpu/examples/cpu/llm).
+**NOTE:** For detailed instructions on installing and setting up the environment for Large Language Models (LLM), as well as example scripts, refer to the [LLM best practices](https://github.com/intel/intel-extension-for-pytorch/tree/v2.5.0%2Bcpu/examples/cpu/llm).
diff --git a/cpu/2.5.0+cpu/_sources/tutorials/introduction.rst.txt b/cpu/2.5.0+cpu/_sources/tutorials/introduction.rst.txt
index 8037db666..1c4309dfb 100644
--- a/cpu/2.5.0+cpu/_sources/tutorials/introduction.rst.txt
+++ b/cpu/2.5.0+cpu/_sources/tutorials/introduction.rst.txt
@@ -16,7 +16,7 @@ the `Large Language Models (LLM) `_ section.
Get Started
-----------
-- `Installation <../../../index.html#installation?platform=cpu&version=v2.4.0%2Bcpu>`_
+- `Installation <../../../index.html#installation?platform=cpu&version=v2.5.0%2Bcpu>`_
- `Quick Start `_
- `Examples `_
diff --git a/cpu/2.5.0+cpu/_sources/tutorials/releases.md.txt b/cpu/2.5.0+cpu/_sources/tutorials/releases.md.txt
index 3ee67a92e..70c8fbed0 100644
--- a/cpu/2.5.0+cpu/_sources/tutorials/releases.md.txt
+++ b/cpu/2.5.0+cpu/_sources/tutorials/releases.md.txt
@@ -1,6 +1,33 @@
Releases
========
+## 2.5.0
+
+We are excited to announce the release of Intel® Extension for PyTorch\* 2.5.0+cpu which accompanies PyTorch 2.5. This release mainly brings you support for Llama 3.2, optimization for the newly launched Intel® Xeon® 6 P-core platform, GPTQ/AWQ format support, and the latest optimizations to push better performance for LLM models. This release also includes a set of bug fixes and small optimizations. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try this release and provide feedback so that we can further improve this product.
+
+### Highlights
+
+* Llama 3.2 support
+
+Meta has newly released [Llama 3.2](https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/), which includes small and medium-sized vision LLMs (11B and 90B), and lightweight, text-only models (1B and 3B). Intel® Extension for PyTorch\* has provided [support of Llama 3.2](https://www.intel.com/content/www/us/en/developer/articles/technical/intel-ai-solutions-support-the-new-llama-3-2-model.html) since its launch date with an early release version, and now supports it with this official release.
+
+* Optimization for Intel® Xeon® 6
+Intel® Xeon® 6 delivers new degrees of performance with more cores, a choice of microarchitecture, additional memory bandwidth, and exceptional input/output (I/O) across a range of workloads. Intel® Extension for PyTorch\* provides dedicated optimization on this new processor family for features like Multiplexed Rank DIMM (MRDIMM), SNC=3 scenario, etc.
+
+* Large Language Model (LLM) optimization:
+Intel® Extension for PyTorch\* provides more feature support of the weight only quantization including GPTQ/AWQ format support, symmetric quantization of activation and weight, and added chunked prefill/prefix prefill support in LLM module API, etc. These features enable better adoption of community model weights and provide better performance for low-precision scenarios. This release also extends the optimized models to include the newly published Llama 3.2 vision models. A full list of optimized models can be found at [LLM optimization](https://github.com/intel/intel-extension-for-pytorch/tree/v2.5.0+cpu/examples/cpu/llm/inference).
+
+* Bug fixes and other optimizations
+ - Optimized the performance of the IndirectAccessKVCacheAttention kernel
+[#3185](https://github.com/intel/intel-extension-for-pytorch/commit/8572e1faf97998783ea2a7fc6ee3094090feebc4) [#3209](https://github.com/intel/intel-extension-for-pytorch/commit/65e96630a2e17f7b762c5c765f10264ad08db098) [#3214](https://github.com/intel/intel-extension-for-pytorch/commit/a04214f7ab4e43648d75abdcf0fae53e5076be2b) [#3218](https://github.com/intel/intel-extension-for-pytorch/commit/f219012ab1babbc67c9b545fa7251cd981a2a3a2) [#3248](https://github.com/intel/intel-extension-for-pytorch/commit/9f6178eb028d36b3ed1f5985e57b7cf160acf38a)
+ - Fixed the segmentation fault in the IndirectAccessKVCacheAttention kernel [#3246](https://github.com/intel/intel-extension-for-pytorch/commit/bee5ab644086c9b25eb61916c6773932c74667d3)
+ - Fixed the correctness issue in the PagedAttention kernel for Llama-68M-Chat-v1 [#3307](https://github.com/intel/intel-extension-for-pytorch/commit/638a7d26acb33af450ea9869b5b43ccdbe0e962b)
+ - Fixed the support in `ipex.llm.optimize` to ensure `model.generate` returns the correct output type when `return_dict_in_generate` is set to `True`. [#3333](https://github.com/intel/intel-extension-for-pytorch/commit/584a4e2e2c6193b926554f951d2608489cac5d7a)
+ - Optimized the performance of the Flash Attention kernel [#3291](https://github.com/intel/intel-extension-for-pytorch/commit/8fb43ec45ed93b62efef07f4b2e8dcd7dd502b8b)
+ - Upgraded oneDNN to v3.6 [#3305](https://github.com/intel/intel-extension-for-pytorch/commit/91639fa0812ee3c12c672002c2bf5cf1cac4bc0a)
+
+**Full Changelog**: https://github.com/intel/intel-extension-for-pytorch/compare/v2.4.0+cpu...v2.5.0+cpu
+
## 2.4.0
We are excited to announce the release of Intel® Extension for PyTorch\* 2.4.0+cpu which accompanies PyTorch 2.4. This release mainly brings you the support for Llama3.1, basic support for LLM serving frameworks like vLLM/TGI, and a set of optimization to push better performance for LLM models. This release also extends the list of optimized LLM models to a broader level and includes a set of bug fixing and small optimizations. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try this release and feedback as to improve further on this product.
diff --git a/cpu/2.5.0+cpu/_static/htmls/tbl_deepspeed.html b/cpu/2.5.0+cpu/_static/htmls/tbl_deepspeed.html
index 2751ef96a..15fc0edf3 100644
--- a/cpu/2.5.0+cpu/_static/htmls/tbl_deepspeed.html
+++ b/cpu/2.5.0+cpu/_static/htmls/tbl_deepspeed.html
@@ -44,6 +44,18 @@
🟩 |
🟩 |
+
+ LLAMA |
+ meta-llama/Llama-3.2-3B-Instruct |
+ 🟩 |
+ 🟩 |
+
+
+ LLAMA |
+ meta-llama/Llama-3.2-11B-Vision-Instruct |
+ 🟩 |
+ 🟩 |
+
GPT-J |
EleutherAI/gpt-j-6b |
@@ -53,13 +65,13 @@
GPT-NEOX |
EleutherAI/gpt-neox-20b |
- 🟨 |
+ 🟩 |
🟩 |
DOLLY |
databricks/dolly-v2-12b |
- 🟨 |
+ 🟩 |
🟩 |
@@ -77,7 +89,7 @@
OPT |
facebook/opt-30b |
- 🟨 |
+ 🟩 |
🟩 |
@@ -89,7 +101,7 @@
Bloom |
bigscience/bloom-1b7 |
- 🟨 |
+ 🟩 |
🟩 |
@@ -113,7 +125,7 @@
Baichuan |
baichuan-inc/Baichuan-13B-Chat |
- 🟨 |
+ 🟩 |
🟩 |
@@ -207,8 +219,4 @@
🟩 |
-
-
- 🟩 signifies that the model can perform well and with good accuracy (<1% difference as compared with FP32).
- 🟨 signifies that the model can perform well while accuracy may not been in a perfect state (>1% difference as compared with FP32).
-
\ No newline at end of file
+
\ No newline at end of file
diff --git a/cpu/2.5.0+cpu/_static/htmls/tbl_single.html b/cpu/2.5.0+cpu/_static/htmls/tbl_single.html
index 4ad7a2284..ef5f9c86a 100644
--- a/cpu/2.5.0+cpu/_static/htmls/tbl_single.html
+++ b/cpu/2.5.0+cpu/_static/htmls/tbl_single.html
@@ -16,9 +16,9 @@
meta-llama/Llama-2-7b-hf |
🟩 |
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
LLAMA |
@@ -43,16 +43,16 @@
meta-llama/Meta-Llama-3-8B |
🟩 |
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
LLAMA |
meta-llama/Meta-Llama-3-70B |
🟩 |
🟩 |
- 🟨 |
+ 🟩 |
🟩 |
🟩 |
@@ -61,10 +61,28 @@
meta-llama/Meta-Llama-3.1-8B-Instruct |
🟩 |
🟩 |
- 🟨 |
+ 🟩 |
🟩 |
🟩 |
+
+ LLAMA |
+ meta-llama/Llama-3.2-3B-Instruct |
+ 🟩 |
+ 🟩 |
+ 🟩 |
+ 🟩 |
+ 🟩 |
+
+
+ LLAMA |
+ meta-llama/Llama-3.2-11B-Vision-Instruct |
+ 🟩 |
+ 🟩 |
+ |
+ 🟩 |
+ |
+
GPT-J |
EleutherAI/gpt-j-6b |
@@ -78,19 +96,19 @@
GPT-NEOX |
EleutherAI/gpt-neox-20b |
🟩 |
- 🟨 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
+ 🟩 |
DOLLY |
databricks/dolly-v2-12b |
🟩 |
- 🟨 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
+ 🟩 |
FALCON |
@@ -99,7 +117,7 @@
🟩 |
🟩 |
🟩 |
- |
+ 🟩 |
FALCON |
@@ -108,7 +126,7 @@
🟩 |
🟩 |
🟩 |
- 🟨 |
+ 🟩 |
FALCON |
@@ -126,7 +144,7 @@
🟩 |
🟩 |
🟩 |
- 🟨 |
+ 🟩 |
OPT |
@@ -135,16 +153,16 @@
🟩 |
🟩 |
🟩 |
- 🟨 |
+ 🟩 |
Bloom |
bigscience/bloom-1b7 |
🟩 |
- 🟨 |
🟩 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
CodeGen |
@@ -162,34 +180,34 @@
🟩 |
🟩 |
🟩 |
- 🟨 |
+ 🟩 |
Baichuan |
baichuan-inc/Baichuan2-13B-Chat |
🟩 |
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
Baichuan |
baichuan-inc/Baichuan-13B-Chat |
🟩 |
- 🟨 |
🟩 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
ChatGLM |
THUDM/chatglm3-6b |
🟩 |
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
ChatGLM |
@@ -198,23 +216,23 @@
🟩 |
🟩 |
🟩 |
- 🟨 |
+ 🟩 |
GPTBigCode |
bigcode/starcoder |
🟩 |
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
T5 |
google/flan-t5-xl |
🟩 |
🟩 |
- 🟨 |
+ 🟩 |
🟩 |
|
@@ -232,9 +250,9 @@
mistralai/Mistral-7B-v0.1 |
🟩 |
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
Mixtral |
@@ -243,34 +261,34 @@
🟩 |
|
🟩 |
- 🟨 |
+ 🟩 |
Stablelm |
stabilityai/stablelm-2-1_6b |
🟩 |
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
Qwen |
Qwen/Qwen-7B-Chat |
🟩 |
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
Qwen |
Qwen/Qwen2-7B |
🟩 |
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
LLaVA |
@@ -296,7 +314,7 @@
🟩 |
🟩 |
|
- 🟨 |
+ 🟩 |
|
@@ -306,43 +324,43 @@
🟩 |
🟩 |
🟩 |
- 🟨 |
+ 🟩 |
Phi |
microsoft/Phi-3-mini-4k-instruct |
🟩 |
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
Phi |
microsoft/Phi-3-mini-128k-instruct |
🟩 |
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
Phi |
microsoft/Phi-3-medium-4k-instruct |
🟩 |
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
Phi |
microsoft/Phi-3-medium-128k-instruct |
🟩 |
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
Whisper |
@@ -354,8 +372,4 @@
|
-
-
- 🟩 signifies that the model can perform well and with good accuracy (<1% difference as compared with FP32).
- 🟨 signifies that the model can perform well while accuracy may not been in a perfect state (>1% difference as compared with FP32).
-
\ No newline at end of file
+
\ No newline at end of file
diff --git a/cpu/2.5.0+cpu/design_doc/cpu/isa_dyndisp.html b/cpu/2.5.0+cpu/design_doc/cpu/isa_dyndisp.html
index 7b6547cb6..69d9f100f 100644
--- a/cpu/2.5.0+cpu/design_doc/cpu/isa_dyndisp.html
+++ b/cpu/2.5.0+cpu/design_doc/cpu/isa_dyndisp.html
@@ -125,7 +125,7 @@ Intel® Extension for PyTorch* CPU ISA Dynamic Dispatch Design DocSphinx using a
theme
provided by Read the Docs.
-
+
© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD),
http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/genindex.html b/cpu/2.5.0+cpu/genindex.html
index 2277ac96e..7d5f93d36 100644
--- a/cpu/2.5.0+cpu/genindex.html
+++ b/cpu/2.5.0+cpu/genindex.html
@@ -377,7 +377,7 @@ V
Built with Sphinx using a
theme
provided by Read the Docs.
-
+
© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD),
http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/index.html b/cpu/2.5.0+cpu/index.html
index 5c8318428..4eb3badab 100644
--- a/cpu/2.5.0+cpu/index.html
+++ b/cpu/2.5.0+cpu/index.html
@@ -183,7 +183,7 @@ SupportSphinx using a
theme
provided by Read the Docs.
-
+
© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD),
http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/py-modindex.html b/cpu/2.5.0+cpu/py-modindex.html
index 8c9e02bb8..fff29e5f4 100644
--- a/cpu/2.5.0+cpu/py-modindex.html
+++ b/cpu/2.5.0+cpu/py-modindex.html
@@ -165,7 +165,7 @@ Python Module Index
Built with Sphinx using a
theme
provided by Read the Docs.
-
+
© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD),
http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/search.html b/cpu/2.5.0+cpu/search.html
index abb60a5a0..c53598003 100644
--- a/cpu/2.5.0+cpu/search.html
+++ b/cpu/2.5.0+cpu/search.html
@@ -140,7 +140,7 @@
Built with Sphinx using a
theme
provided by Read the Docs.
-
+
© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD),
http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/searchindex.js b/cpu/2.5.0+cpu/searchindex.js
index 082550705..5a4f1c84c 100644
--- a/cpu/2.5.0+cpu/searchindex.js
+++ b/cpu/2.5.0+cpu/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"alltitles": {"$\\alpha$ Usage": [[16, "alpha-usage"]], "1. Creating a serialized file": [[32, "creating-a-serialized-file"]], "1. Defining hyperparameters to tune:": [[14, "defining-hyperparameters-to-tune"]], "1.0.0-Alpha": [[34, "id45"]], "1.0.1-Alpha": [[34, "alpha"]], "1.0.2": [[34, "id44"]], "1.1.0": [[34, "id42"]], "1.10.0": [[34, "id32"]], "1.10.100": [[34, "id31"]], "1.11.0": [[34, "id29"]], "1.11.200": [[34, "id27"]], "1.12.0": [[34, "id24"]], "1.12.100": [[34, "id23"]], "1.12.300": [[34, "id21"]], "1.13.0": [[34, "id18"]], "1.13.100": [[34, "id16"]], "1.2.0": [[34, "id39"]], "1.8.0": [[34, "id37"]], "1.9.0": [[34, "id36"]], "2. Creating a Model Archive": [[32, "creating-a-model-archive"]], "2. Defining the search spaces of the hyperparameters:": [[14, "defining-the-search-spaces-of-the-hyperparameters"]], "2.0.0": [[34, "id14"]], "2.0.100": [[34, "id12"]], "2.1.0": [[34, "id10"]], "2.1.100": [[34, "id8"]], "2.2.0": [[34, "id6"]], "2.3.0": [[34, "id4"]], "2.3.100": [[34, "id2"]], "2.4.0": [[34, "id1"]], "3. Start TorchServe to serve the model": [[32, "start-torchserve-to-serve-the-model"]], "4. 
Registering and Deploying model": [[32, "registering-and-deploying-model"]], "": [[14, "your-python-script"]], "API Documentation": [[2, null], [25, "api-documentation"]], "Accuracy": [[30, "accuracy"]], "Add Custom Kernel": [[17, "add-custom-kernel"]], "Algorithm: Auto-tuning of $\\alpha$.": [[16, "algorithm-auto-tuning-of-alpha"]], "Already using Jit Trace": [[10, "already-using-jit-trace"]], "Already using ipex.optimize": [[10, "already-using-ipex-optimize"]], "Architecture": [[1, "architecture"]], "Auto Channels Last": [[7, "auto-channels-last"], [9, null]], "Auto Mixed Precision (AMP)": [[7, "auto-mixed-precision-amp"], [8, null]], "Autocast Op Reference": [[8, "autocast-op-reference"]], "BERT": [[6, "bert"], [6, "id2"], [6, "id4"], [6, "id7"], [6, "id10"], [6, "id13"], [32, "bert"]], "BFloat16": [[6, "bfloat16"], [21, "bfloat16"], [26, "bfloat16"]], "Benchmarking with Launcher": [[32, "benchmarking-with-launcher"]], "Benchmarking with Launcher Core Pinning": [[32, "benchmarking-with-launcher-core-pinning"]], "Better local unit tests with pytest": [[5, "better-local-unit-tests-with-pytest"]], "Blogs & Publications": [[3, null]], "Building documentation": [[5, "building-documentation"]], "C++": [[6, "c"]], "C++ Unit Testing": [[5, "c-unit-testing"]], "CPU Channels Last Targets": [[18, "cpu-channels-last-targets"]], "CPU ISA build compiler requirement": [[17, "cpu-isa-build-compiler-requirement"]], "CPU Runtime": [[2, "module-intel_extension_for_pytorch.cpu.runtime"]], "CPU feature check": [[17, "cpu-feature-check"]], "Calibration": [[6, "calibration"]], "Channels Last": [[18, null], [33, "channels-last"]], "Cheat Sheet": [[4, null]], "Code Folder Struct": [[17, "code-folder-struct"]], "CodeGen Process": [[17, "codegen-process"]], "Codeless Optimization (Prototype)": [[10, null]], "Codeless Optimization (Prototype, NEW feature from 1.13.0)": [[7, "codeless-optimization-prototype-new-feature-from-1-13-0"]], "Command to apply ipex optimization for BF16": [[10, 
"command-to-apply-ipex-optimization-for-bf16"]], "Command to apply ipex optimization for FP32": [[10, "command-to-apply-ipex-optimization-for-fp32"]], "Configuration": [[30, "configuration"], [30, "id2"], [30, "id5"]], "Contents of this Document": [[32, "contents-of-this-document"], [33, "contents-of-this-document"]], "Contributing to Intel\u00ae Extension for PyTorch*": [[5, "contributing-to-intel-extension-for-pytorch"]], "Contribution": [[5, null]], "Convert to Dynamic Quantized Model and Deploy": [[15, "convert-to-dynamic-quantized-model-and-deploy"]], "Convert to Static Quantized Model and Deploy": [[15, "convert-to-static-quantized-model-and-deploy"]], "Creating and Exporting INT8 model for Intel\u00ae Extension for PyTorch*": [[32, "creating-and-exporting-int8-model-for-intel-extension-for-pytorch"]], "Default Precision": [[8, "default-precision"]], "Default memory allocator": [[31, "default-memory-allocator"]], "Default search space": [[14, "default-search-space"]], "Define QConfig": [[15, "id1"]], "Define qconfig": [[15, "define-qconfig"]], "Defining hyperparameters and their search spaces": [[14, "defining-hyperparameters-and-their-search-spaces"]], "Demos": [[28, "demos"]], "Denormal Number": [[33, "denormal-number"]], "Deployment": [[6, "deployment"]], "Design of Task": [[20, "design-of-task"]], "Detail Design": [[20, "detail-design"]], "Determining the alpha through auto-tuning": [[16, "determining-the-alpha-through-auto-tuning"]], "Developing Intel\u00ae Extension for PyTorch*": [[5, "developing-intel-extension-for-pytorch"]], "Dispatch Stub implementation: csrc/cpu/dyndisp/DispatchStub.cpp and csrc/cpu/dyndisp/DispatchStub.h": [[17, "dispatch-stub-implementation-csrc-cpu-dyndisp-dispatchstub-cpp-and-csrc-cpu-dyndisp-dispatchstub-h"]], "Distributed Inference": [[28, "distributed-inference"]], "Distributed Inference with DeepSpeed": [[29, "distributed-inference-with-deepspeed"]], "Distributed Training": [[6, "distributed-training"]], "Dynamic Dispatch 
Design": [[17, "dynamic-dispatch-design"]], "Dynamic Quantization": [[6, "dynamic-quantization"], [15, "dynamic-quantization"]], "Dynamic Shape": [[26, "dynamic-shape"]], "Eager Mode": [[6, "eager-mode"], [6, "id5"]], "Ease-of-use auto channels last API": [[9, "ease-of-use-auto-channels-last-api"]], "Ease-of-use graph optimization API": [[13, "ease-of-use-graph-optimization-api"]], "Easy-to-use Python API": [[7, "easy-to-use-python-api"]], "Example Usage with HuggingFace": [[10, "example-usage-with-huggingface"]], "Example of MultiStream Module": [[20, "example-of-multistream-module"]], "Example of asynchronous task": [[20, "example-of-asynchronous-task"]], "Example of configuring core binding": [[20, "example-of-configuring-core-binding"]], "Example:": [[17, "example"], [17, "id1"]], "Examples": [[6, null]], "Examples1: Basic Usage": [[20, "examples1-basic-usage"]], "Examples2: Usage with \u201cAUTO\u201d setting": [[20, "examples2-usage-with-auto-setting"]], "Examples3: Usage for models with structure inputs/outputs": [[20, "examples3-usage-for-models-with-structure-inputs-outputs"]], "FP32 and BF16 fusion patterns": [[13, "fp32-and-bf16-fusion-patterns"]], "FP32 and BF16 models": [[13, "fp32-and-bf16-models"]], "FP32 and BFloat16 with v1.10": [[30, "fp32-and-bfloat16-with-v1-10"]], "FP32 with v1.11.200 on an AWS EC2 C6i.2xlarge instance": [[30, "fp32-with-v1-11-200-on-an-aws-ec2-c6i-2xlarge-instance"]], "FP32/BF16": [[6, "fp32-bf16"], [29, "fp32-bf16"]], "Fast BERT (Prototype)": [[11, null]], "Fast BERT Optimization (Prototype, NEW feature from 2.0.0)": [[7, "fast-bert-optimization-prototype-new-feature-from-2-0-0"]], "Fast Bert (Prototype)": [[2, "fast-bert-prototype"], [6, "fast-bert-prototype"]], "Feature Description": [[11, "feature-description"], [12, "feature-description"]], "Features": [[7, null]], "Float32": [[6, "float32"]], "Folding": [[13, "folding"]], "Fusion": [[13, "fusion"]], "GNU OpenMP": [[33, "gnu-openmp"]], "GNU OpenMP Library": [[31, 
"gnu-openmp-library"]], "General": [[2, "general"]], "General Usage": [[26, "general-usage"]], "Get Started": [[25, "get-started"]], "Graph Capture (Prototype)": [[12, null]], "Graph Capture (Prototype, NEW feature from 1.13.0)": [[7, "graph-capture-prototype-new-feature-from-1-13-0"]], "Graph Optimization": [[2, "graph-optimization"], [7, "graph-optimization"], [13, null], [28, "graph-optimization"]], "Hardware Configuration": [[30, "hardware-configuration"], [30, "id7"], [33, "hardware-configuration"]], "Highlights": [[34, "highlights"], [34, "id3"], [34, "id5"], [34, "id7"], [34, "id9"], [34, "id11"], [34, "id13"], [34, "id15"], [34, "id17"], [34, "id19"], [34, "id22"], [34, "id25"], [34, "id28"], [34, "id30"], [34, "id33"]], "How the core binding is implemented": [[20, "how-the-core-binding-is-implemented"]], "HyperTune (Prototype)": [[14, null]], "HyperTune (Prototype, NEW feature from 1.13.0)": [[7, "hypertune-prototype-new-feature-from-1-13-0"]], "Hyperparameters": [[14, "hyperparameters"]], "I. Use all physical cores": [[31, "i-use-all-physical-cores"]], "II. Use all cores including logical cores": [[31, "ii-use-all-cores-including-logical-cores"]], "III. Use physical cores on designated nodes": [[31, "iii-use-physical-cores-on-designated-nodes"]], "INT8": [[6, "int8"], [26, "int8"]], "INT8 Quantization": [[7, "int8-quantization"]], "INT8 Recipe Tuning API (Prototype)": [[16, null]], "INT8 fusion patterns": [[13, "int8-fusion-patterns"]], "INT8 models": [[13, "int8-models"]], "INT8 with v1.11": [[30, "int8-with-v1-11"]], "IOMP preload or load during the runtime": [[20, "iomp-preload-or-load-during-the-runtime"]], "ISA Dynamic Dispatching": [[7, "isa-dynamic-dispatching"], [17, null]], "ISA intrinics specific kernel example:": [[17, "isa-intrinics-specific-kernel-example"]], "IV. 
Use your designated number of cores": [[31, "iv-use-your-designated-number-of-cores"]], "Indirect Access KV Cache": [[28, "indirect-access-kv-cache"]], "Inference": [[6, "inference"]], "Inference with Eager Path": [[8, "inference-with-eager-path"]], "Inference with TorchScript Path": [[8, "inference-with-torchscript-path"]], "Install Intel\u00ae Extension for PyTorch*": [[32, "install-intel-extension-for-pytorch"]], "Installation": [[24, null]], "Intel CPU Structure": [[33, "intel-cpu-structure"]], "Intel OpenMP": [[33, "intel-openmp"]], "Intel OpenMP Library": [[31, "intel-openmp-library"]], "Intel\u00ae AI Reference Models": [[6, "intel-ai-reference-models"]], "Intel\u00ae Extension for PyTorch*": [[1, null]], "Intel\u00ae Extension for PyTorch* CPU ISA Dynamic Dispatch Design Doc": [[0, null]], "Intel\u00ae Extension for PyTorch* optimizations for quantization": [[15, null]], "Introduction": [[8, "introduction"], [19, "introduction"], [25, null]], "Jemalloc": [[31, "jemalloc"], [33, "jemalloc"]], "Kernel Stub: csrc/cpu/aten/xyz.cpp and csrc/cpu/aten/xyz.h": [[17, "kernel-stub-csrc-cpu-aten-xyz-cpp-and-csrc-cpu-aten-xyz-h"]], "Kernel implementation: csrc/cpu/aten/kernels/xyzKrnl.cpp": [[17, "kernel-implementation-csrc-cpu-aten-kernels-xyzkrnl-cpp"]], "Known Issues": [[34, "known-issues"], [34, "id20"], [34, "id26"], [34, "id34"]], "Known issue": [[9, "known-issue"], [34, "known-issue"], [34, "id47"]], "Known issues": [[20, "known-issues"], [34, "id41"]], "LLM Module Level Optimizations (Prototype)": [[2, "llm-module-level-optimizations-prototype"]], "LLM Optimizations Frontend API": [[29, null]], "LLM Performance": [[30, "llm-performance"]], "LLM Quick Start": [[23, "llm-quick-start"]], "Large Language Model (LLM)": [[6, "large-language-model-llm"]], "Large Language Models (LLM) Optimization Overview": [[28, null]], "Large Language Models (LLM, NEW feature from 2.1.0)": [[7, "large-language-models-llm-new-feature-from-2-1-0"]], "Launch Script Usage Guide": [[31, 
null]], "Launcher Core Pinning to Boost Performance of TorchServe Multi Worker Inference": [[32, "launcher-core-pinning-to-boost-performance-of-torchserve-multi-worker-inference"]], "Launcher Hyperparameters": [[14, "launcher-hyperparameters"]], "License": [[27, null]], "Linear Operator Optimization": [[28, "linear-operator-optimization"]], "Local linting": [[5, "local-linting"]], "Low Precision Data Types": [[28, "low-precision-data-types"]], "Memory Allocator": [[33, "memory-allocator"]], "Memory Format Is All That Matters": [[18, "memory-format-is-all-that-matters"]], "Methodology": [[13, "methodology"]], "Module Level Optimization API for customized LLM (Prototype)": [[28, "module-level-optimization-api-for-customized-llm-prototype"]], "Module uses forward method explicitly instead of the __call__ attr": [[10, "module-uses-forward-method-explicitly-instead-of-the-call-attr"]], "Motivation": [[10, "motivation"]], "Multiple instances for inference": [[31, "multiple-instances-for-inference"]], "NOTE": [[34, "note"]], "Non-Uniform Memory Access (NUMA)": [[33, "non-uniform-memory-access-numa"]], "Numactl": [[33, "numactl"]], "OMP_NUM_THREADS": [[33, "omp-num-threads"]], "OMP_THREAD_LIMIT": [[33, "omp-thread-limit"]], "OneDNN primitive cache": [[33, "onednn-primitive-cache"]], "Op Eligibility": [[8, "op-eligibility"]], "Op-Specific Behavior": [[8, "op-specific-behavior"]], "OpenMP": [[33, "openmp"]], "Operation Fusion": [[19, "operation-fusion"]], "Operator Optimization": [[7, "operator-optimization"]], "Ops that can autocast to bfloat16": [[8, "ops-that-can-autocast-to-bfloat16"]], "Ops that can autocast to float32": [[8, "ops-that-can-autocast-to-float32"]], "Ops that promote to the widest input type": [[8, "ops-that-promote-to-the-widest-input-type"]], "Optimization Methodologies": [[28, "optimization-methodologies"]], "Optimizer Fusion": [[19, null]], "Optimizer Optimization": [[7, "optimizer-optimization"]], "Others": [[34, "others"]], "Overview": [[17, 
"overview"], [30, "overview"], [31, "overview"], [33, "overview"]], "Performance": [[30, null], [34, "performance"]], "Performance Boost with Intel\u00ae Extension for PyTorch* and Launcher": [[32, "performance-boost-with-intel-extension-for-pytorch-and-launcher"]], "Performance Data for Intel\u00ae AI Data Center Products": [[30, "performance-data-for-intel-ai-data-center-products"]], "Performance Improvement": [[34, "performance-improvement"]], "Performance Numbers": [[30, "performance-numbers"], [30, "id1"], [30, "id4"]], "Performance Regression": [[26, "performance-regression"]], "Performance Result": [[34, "performance-result"]], "Performance Tuning Guide": [[33, null]], "Performance recipes": [[20, "performance-recipes"]], "Prepare Model": [[15, "prepare-model"]], "Prepare Model and Do Calibration": [[15, "prepare-model-and-do-calibration"]], "Prerequisite": [[11, "prerequisite"]], "Private Debug APIs": [[17, "private-debug-apis"]], "Pseudocode of Common Usage Scenarios": [[29, "pseudocode-of-common-usage-scenarios"]], "PyTorch Channels Last Memory Format APIs": [[18, "pytorch-channels-last-memory-format-apis"]], "PyTorch Strided Layout": [[18, "pytorch-strided-layout"]], "Python": [[6, "python"]], "Python Unit Testing": [[5, "python-unit-testing"]], "Quantization": [[2, "module-intel_extension_for_pytorch.quantization"]], "Quick Start": [[23, null]], "Releases": [[34, null]], "Requirements": [[20, "requirements"]], "ResNet50": [[32, "resnet50"]], "Resnet50": [[6, "resnet50"], [6, "id1"], [6, "id3"], [6, "id6"], [6, "id9"], [6, "id12"]], "Result Correctness": [[26, "result-correctness"]], "Runtime Extension": [[7, "runtime-extension"], [20, null], [26, "runtime-extension"]], "Scaling workers": [[32, "scaling-workers"]], "Select ISA level manually.": [[17, "select-isa-level-manually"]], "Serving model with Intel\u00ae Extension for PyTorch*": [[32, "serving-model-with-intel-extension-for-pytorch"]], "Single instance for inference": [[31, 
"single-instance-for-inference"]], "Smooth Quant Recipe Tuning API (Prototype)": [[22, null]], "Smooth Quantization Autotune": [[16, "smooth-quantization-autotune"]], "Smooth Quantization INT8": [[6, "smooth-quantization-int8"]], "SmoothQuant": [[29, "smoothquant"]], "Software Configuration": [[33, "software-configuration"]], "Software Version": [[30, "software-version"], [30, "id3"], [30, "id6"]], "Split SGD": [[21, null], [21, "id2"]], "Static Quantization": [[6, "static-quantization"], [15, "static-quantization"]], "Stochastic Gradient Descent (SGD)": [[21, "stochastic-gradient-descent-sgd"]], "Support": [[1, "support"]], "TCMalloc": [[31, "tcmalloc"], [33, "tcmalloc"]], "The origin command with ipex launch": [[10, "the-origin-command-with-ipex-launch"]], "Tips": [[5, "tips"]], "Tips and Debugging": [[5, "tips-and-debugging"]], "TorchDynamo": [[26, "torchdynamo"]], "TorchDynamo Mode (Beta, NEW feature from 2.0.0)": [[6, "torchdynamo-mode-beta-new-feature-from-2-0-0"], [6, "id11"]], "TorchScript Mode": [[6, "torchscript-mode"], [6, "id8"]], "TorchServe with Intel\u00ae Extension for PyTorch*": [[32, null]], "TorchServe with Launcher": [[32, "torchserve-with-launcher"]], "Training": [[6, "training"]], "Training Support": [[8, "training-support"]], "Troubleshooting": [[26, null]], "Unit testing": [[5, "unit-testing"]], "Usage Example": [[11, "usage-example"], [12, "usage-example"], [16, "usage-example"]], "Usage Examples": [[14, "usage-examples"], [31, "usage-examples"]], "Usage of Hypertune": [[14, "usage-of-hypertune"]], "Usage of Jemalloc/TCMalloc/Default memory allocator": [[31, "usage-of-jemalloc-tcmalloc-default-memory-allocator"]], "Usage of OpenMP library": [[31, "usage-of-openmp-library"]], "Usage of launch script": [[31, "usage-of-launch-script"]], "Use Case": [[8, "use-case"]], "Use Case not supported": [[10, "use-case-not-supported"]], "Use Cases": [[20, "use-cases"]], "User defined search space": [[14, "user-defined-search-space"]], "Using a fixed 
alpha": [[16, "using-a-fixed-alpha"]], "V. Throughput mode": [[31, "v-throughput-mode"]], "VI. Latency mode": [[31, "vi-latency-mode"]], "VII. Your designated number of instances": [[31, "vii-your-designated-number-of-instances"]], "VIII. Your designated number of instances and instance index": [[31, "viii-your-designated-number-of-instances-and-instance-index"]], "Vec specific kernel example:": [[17, "vec-specific-kernel-example"]], "Verified for distributed inference mode via DeepSpeed": [[28, "verified-for-distributed-inference-mode-via-deepspeed"]], "Verified for single instance mode": [[28, "verified-for-single-instance-mode"]], "Weight Only Quantization (WOQ)": [[29, "weight-only-quantization-woq"]], "Weight Only Quantization INT8/INT4": [[6, "weight-only-quantization-int8-int4"]], "What is Channels Last": [[18, "what-is-channels-last"]], "What\u2019s Changed": [[34, "what-s-changed"], [34, "id35"]], "What\u2019s New": [[34, "what-s-new"], [34, "id38"], [34, "id40"], [34, "id43"], [34, "id46"]], "Writing Channels Last Kernels": [[18, "writing-channels-last-kernels"]], "Writing documentation": [[5, "writing-documentation"]], "a. Create NHWC Memory": [[18, "a-create-nhwc-memory"]], "a. NCHW (default)": [[18, "a-nchw-default"]], "a. Status on CPU": [[18, "a-status-on-cpu"]], "a. tensor creation": [[18, "a-tensor-creation"]], "b. Create Convolution Primitive": [[18, "b-create-convolution-primitive"]], "b. NHWC (WIP for CPU)": [[18, "b-nhwc-wip-for-cpu"]], "b. Register Channels Last Kernel in ATen Native Manner": [[18, "b-register-channels-last-kernel-in-aten-native-manner"]], "b. tensor conversion": [[18, "b-tensor-conversion"]], "c. Blocked (nChw16c)": [[18, "c-blocked-nchw16c"]], "c. Register oneDNN Kernel on Channels Last": [[18, "c-register-onednn-kernel-on-channels-last"]], "c. model conversion": [[18, "c-model-conversion"]], "d. 
operator coverage": [[18, "d-operator-coverage"]], "default": [[9, "default"]], "disable": [[9, "disable"]], "enable": [[9, "enable"]], "ipex.llm Optimized Model List for Inference": [[28, "ipex-llm-optimized-model-list-for-inference"]], "oneDNN NHWC APIs": [[18, "onednn-nhwc-apis"]], "torch.compile (Beta, NEW feature from 2.0.0)": [[7, "torch-compile-beta-new-feature-from-2-0-0"]], "your_conf_file": [[14, "your-conf-file"]]}, "docnames": ["design_doc/cpu/isa_dyndisp", "index", "tutorials/api_doc", "tutorials/blogs_publications", "tutorials/cheat_sheet", "tutorials/contribution", "tutorials/examples", "tutorials/features", "tutorials/features/amp", "tutorials/features/auto_channels_last", "tutorials/features/codeless_optimization", "tutorials/features/fast_bert", "tutorials/features/graph_capture", "tutorials/features/graph_optimization", "tutorials/features/hypertune", "tutorials/features/int8_overview", "tutorials/features/int8_recipe_tuning_api", "tutorials/features/isa_dynamic_dispatch", "tutorials/features/nhwc", "tutorials/features/optimizer_fusion", "tutorials/features/runtime_extension", "tutorials/features/split_sgd", "tutorials/features/sq_recipe_tuning_api", "tutorials/getting_started", "tutorials/installation", "tutorials/introduction", "tutorials/known_issues", "tutorials/license", "tutorials/llm", "tutorials/llm/llm_optimize", "tutorials/performance", "tutorials/performance_tuning/launch_script", "tutorials/performance_tuning/torchserve", "tutorials/performance_tuning/tuning_guide", "tutorials/releases"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["design_doc/cpu/isa_dyndisp.md", "index.rst", "tutorials/api_doc.rst", "tutorials/blogs_publications.md", "tutorials/cheat_sheet.md", 
"tutorials/contribution.md", "tutorials/examples.md", "tutorials/features.rst", "tutorials/features/amp.md", "tutorials/features/auto_channels_last.md", "tutorials/features/codeless_optimization.md", "tutorials/features/fast_bert.md", "tutorials/features/graph_capture.md", "tutorials/features/graph_optimization.md", "tutorials/features/hypertune.md", "tutorials/features/int8_overview.md", "tutorials/features/int8_recipe_tuning_api.md", "tutorials/features/isa_dynamic_dispatch.md", "tutorials/features/nhwc.md", "tutorials/features/optimizer_fusion.md", "tutorials/features/runtime_extension.md", "tutorials/features/split_sgd.rst", "tutorials/features/sq_recipe_tuning_api.md", "tutorials/getting_started.md", "tutorials/installation.md", "tutorials/introduction.rst", "tutorials/known_issues.md", "tutorials/license.md", "tutorials/llm.rst", "tutorials/llm/llm_optimize.md", "tutorials/performance.md", "tutorials/performance_tuning/launch_script.md", "tutorials/performance_tuning/torchserve.md", "tutorials/performance_tuning/tuning_guide.md", "tutorials/releases.md"], "indexentries": {"autotune() (in module intel_extension_for_pytorch.quantization)": [[2, "intel_extension_for_pytorch.quantization.autotune", false]], "convert() (in module intel_extension_for_pytorch.quantization)": [[2, "intel_extension_for_pytorch.quantization.convert", false]], "cpupool (class in intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.CPUPool", false]], "enable_onednn_fusion() (in module intel_extension_for_pytorch)": [[2, "intel_extension_for_pytorch.enable_onednn_fusion", false]], "fast_bert() (in module intel_extension_for_pytorch)": [[2, "intel_extension_for_pytorch.fast_bert", false]], "fast_layer_norm() (in module intel_extension_for_pytorch.llm.functional)": [[2, "intel_extension_for_pytorch.llm.functional.fast_layer_norm", false]], "fastlayernorm (class in intel_extension_for_pytorch.llm.modules)": [[2, 
"intel_extension_for_pytorch.llm.modules.FastLayerNorm", false]], "frozenbatchnorm2d (class in intel_extension_for_pytorch.nn)": [[7, "intel_extension_for_pytorch.nn.FrozenBatchNorm2d", false]], "get_core_list_of_node_id() (in module intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.get_core_list_of_node_id", false]], "get_smooth_quant_qconfig_mapping() (in module intel_extension_for_pytorch.quantization)": [[2, "intel_extension_for_pytorch.quantization.get_smooth_quant_qconfig_mapping", false]], "get_weight_only_quant_qconfig_mapping() (in module intel_extension_for_pytorch.quantization)": [[2, "intel_extension_for_pytorch.quantization.get_weight_only_quant_qconfig_mapping", false]], "indirect_access_kv_cache_attention() (in module intel_extension_for_pytorch.llm.functional)": [[2, "intel_extension_for_pytorch.llm.functional.indirect_access_kv_cache_attention", false]], "indirectaccesskvcacheattention (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.IndirectAccessKVCacheAttention", false]], "intel_extension_for_pytorch": [[2, "module-intel_extension_for_pytorch", false]], "intel_extension_for_pytorch.cpu.runtime": [[2, "module-intel_extension_for_pytorch.cpu.runtime", false]], "intel_extension_for_pytorch.llm": [[2, "module-intel_extension_for_pytorch.llm", false]], "intel_extension_for_pytorch.llm.functional": [[2, "module-intel_extension_for_pytorch.llm.functional", false]], "intel_extension_for_pytorch.llm.modules": [[2, "module-intel_extension_for_pytorch.llm.modules", false]], "intel_extension_for_pytorch.quantization": [[2, "module-intel_extension_for_pytorch.quantization", false]], "interaction() (in module intel_extension_for_pytorch.nn.functional)": [[7, "intel_extension_for_pytorch.nn.functional.interaction", false]], "is_runtime_ext_enabled() (in module intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.is_runtime_ext_enabled", 
false]], "linear2silumul (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.Linear2SiluMul", false]], "linearadd (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearAdd", false]], "linearaddadd (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearAddAdd", false]], "lineargelu (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearGelu", false]], "linearmul (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearMul", false]], "linearnewgelu (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearNewGelu", false]], "linearrelu (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearRelu", false]], "linearsilu (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearSilu", false]], "linearsilumul (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearSiluMul", false]], "mergedembeddingbag (class in intel_extension_for_pytorch.nn.modules)": [[7, "intel_extension_for_pytorch.nn.modules.MergedEmbeddingBag", false]], "mergedembeddingbagwithsgd (class in intel_extension_for_pytorch.nn.modules)": [[7, "intel_extension_for_pytorch.nn.modules.MergedEmbeddingBagWithSGD", false]], "module": [[2, "module-intel_extension_for_pytorch", false], [2, "module-intel_extension_for_pytorch.cpu.runtime", false], [2, "module-intel_extension_for_pytorch.llm", false], [2, "module-intel_extension_for_pytorch.llm.functional", false], [2, "module-intel_extension_for_pytorch.llm.modules", false], [2, "module-intel_extension_for_pytorch.quantization", false]], "multistreammodule (class in intel_extension_for_pytorch.cpu.runtime)": [[2, 
"intel_extension_for_pytorch.cpu.runtime.MultiStreamModule", false]], "multistreammodulehint (class in intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.MultiStreamModuleHint", false]], "optimize() (in module intel_extension_for_pytorch)": [[2, "intel_extension_for_pytorch.optimize", false]], "optimize() (in module intel_extension_for_pytorch.llm)": [[2, "intel_extension_for_pytorch.llm.optimize", false]], "pagedattention (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.PagedAttention", false]], "pin (class in intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.pin", false]], "prepare() (in module intel_extension_for_pytorch.quantization)": [[2, "intel_extension_for_pytorch.quantization.prepare", false]], "rms_norm() (in module intel_extension_for_pytorch.llm.functional)": [[2, "intel_extension_for_pytorch.llm.functional.rms_norm", false]], "rmsnorm (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.RMSNorm", false]], "rotary_embedding() (in module intel_extension_for_pytorch.llm.functional)": [[2, "intel_extension_for_pytorch.llm.functional.rotary_embedding", false]], "rotaryembedding (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.RotaryEmbedding", false]], "task (class in intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.Task", false]], "varlen_attention() (in module intel_extension_for_pytorch.llm.functional)": [[2, "intel_extension_for_pytorch.llm.functional.varlen_attention", false]], "varlenattention (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.VarlenAttention", false]], "verbose (class in intel_extension_for_pytorch)": [[2, "intel_extension_for_pytorch.verbose", false]]}, "objects": {"": [[2, 0, 0, "-", "intel_extension_for_pytorch"]], 
"intel_extension_for_pytorch": [[2, 2, 1, "", "enable_onednn_fusion"], [2, 2, 1, "", "fast_bert"], [2, 0, 0, "-", "llm"], [2, 2, 1, "", "optimize"], [2, 0, 0, "-", "quantization"], [2, 1, 1, "", "verbose"]], "intel_extension_for_pytorch.cpu": [[2, 0, 0, "-", "runtime"]], "intel_extension_for_pytorch.cpu.runtime": [[2, 1, 1, "", "CPUPool"], [2, 1, 1, "", "MultiStreamModule"], [2, 1, 1, "", "MultiStreamModuleHint"], [2, 1, 1, "", "Task"], [2, 2, 1, "", "get_core_list_of_node_id"], [2, 2, 1, "", "is_runtime_ext_enabled"], [2, 1, 1, "", "pin"]], "intel_extension_for_pytorch.llm": [[2, 0, 0, "-", "functional"], [2, 0, 0, "-", "modules"], [2, 2, 1, "", "optimize"]], "intel_extension_for_pytorch.llm.functional": [[2, 2, 1, "", "fast_layer_norm"], [2, 2, 1, "", "indirect_access_kv_cache_attention"], [2, 2, 1, "", "rms_norm"], [2, 2, 1, "", "rotary_embedding"], [2, 2, 1, "", "varlen_attention"]], "intel_extension_for_pytorch.llm.modules": [[2, 1, 1, "", "FastLayerNorm"], [2, 1, 1, "", "IndirectAccessKVCacheAttention"], [2, 1, 1, "", "Linear2SiluMul"], [2, 1, 1, "", "LinearAdd"], [2, 1, 1, "", "LinearAddAdd"], [2, 1, 1, "", "LinearGelu"], [2, 1, 1, "", "LinearMul"], [2, 1, 1, "", "LinearNewGelu"], [2, 1, 1, "", "LinearRelu"], [2, 1, 1, "", "LinearSilu"], [2, 1, 1, "", "LinearSiluMul"], [2, 1, 1, "", "PagedAttention"], [2, 1, 1, "", "RMSNorm"], [2, 1, 1, "", "RotaryEmbedding"], [2, 1, 1, "", "VarlenAttention"]], "intel_extension_for_pytorch.nn": [[7, 1, 1, "", "FrozenBatchNorm2d"]], "intel_extension_for_pytorch.nn.functional": [[7, 2, 1, "", "interaction"]], "intel_extension_for_pytorch.nn.modules": [[7, 1, 1, "", "MergedEmbeddingBag"], [7, 1, 1, "", "MergedEmbeddingBagWithSGD"]], "intel_extension_for_pytorch.quantization": [[2, 2, 1, "", "autotune"], [2, 2, 1, "", "convert"], [2, 2, 1, "", "get_smooth_quant_qconfig_mapping"], [2, 2, 1, "", "get_weight_only_quant_qconfig_mapping"], [2, 2, 1, "", "prepare"]]}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", 
"class", "Python class"], "2": ["py", "function", "Python function"]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:function"}, "terms": {"": [2, 3, 5, 8, 10, 14, 15, 18, 19, 20, 21, 22, 26, 31, 32, 33], "0": [1, 2, 4, 5, 8, 10, 11, 13, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 30, 31, 32, 33], "00": [31, 34], "00000": 21, "00000000000602e7": 17, "0000012345": 21, "001": [6, 8], "0016": 30, "01": [2, 4, 7, 16, 31, 32, 34], "02": [30, 32], "02x": 30, "03": 32, "03x": 30, "04": [30, 31], "04x": 30, "05": [2, 7, 10, 30, 31], "05x": 30, "06": [2, 31, 32], "06x": 30, "07": 31, "07x": 30, "08": 31, "08x": 30, "09": [17, 31], "096": 32, "09864": 2, "09x": 30, "0x00007f3cde954000": 6, "0x00007f3ce16ac000": 6, "0x00007f3cf70fc000": 6, "0x00007f3cf985a000": 6, "0x00007f3cf98e0000": 6, "0x1": 17, "0x700001c": 30, "0x7fff": 17, "0xd0002a0": 30, "0xffff": 17, "1": [1, 2, 3, 4, 6, 8, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 28, 29, 30, 31, 33], "10": [7, 14, 16, 17, 18, 21, 25, 26, 31, 32, 33], "100": [2, 4, 14, 16, 17, 30, 32], "10000": 2, "1009": 30, "100mb": 34, "1024": [30, 33], "102b": 28, "1032": 34, "10438": 2, "1053": 34, "1074": 34, "10k": 6, "10x": 30, "11": [17, 31, 32], "111": 33, "112": [26, 30, 33, 34], "117": 31, "118": 31, "11b": [28, 34], "11x": 30, "12": [6, 10, 14, 17, 30, 31, 32], "1200": 30, "12345": 21, "1234500000": 21, "1234512345": 21, "125m": 6, "127": [6, 31, 34], "128": [6, 8, 10, 13, 20, 30, 34], "128k": [2, 28, 34], "128task": 30, "1295": 34, "12b": 28, "12x": 30, "13": [3, 10, 17, 30, 31, 32, 33], "1318": 34, "1322": 34, "1328": 34, "1330": 34, "1338": 34, "1341": 34, "1353": 34, "1355": 34, "1367": 34, "1373": 34, "1376": 34, "1384": 34, "1391": 34, "1392": 34, "13b": [28, 30, 34], "13x": 30, "14": [31, 32, 34], "140": 31, "1414": 34, "1419": 34, "143": 31, "146": 31, "1473": 34, "1488": 34, "149": 31, "14x": 30, "15": [14, 17, 30, 31, 32], "151": 31, "1513": 34, "1517": 34, "154": 31, "1563": 34, "1564": 34, "1566": 
34, "1568": 34, "157": 31, "1580": 34, "1585": 34, "1587": 34, "1589": 34, "159": 31, "1590": 34, "1592": 34, "1593": 34, "1594": 34, "15x": 30, "16": [2, 17, 20, 21, 30, 31, 32], "160": 30, "162": 31, "164": 31, "1664": 34, "167": 31, "1677": 34, "1682": 34, "1688": 34, "1695": 34, "16gb": 30, "16x": 30, "16xlarg": 30, "17": [6, 30, 31, 32], "170": 30, "175": 31, "176": 31, "177": 31, "17th": 30, "18": [30, 31, 32], "18x": 30, "19": [7, 30, 31, 32, 34], "199": 30, "19x": 30, "1_6b": 28, "1b7": 28, "1d": 18, "1e": [2, 7, 10, 16], "1mb": 33, "2": [1, 2, 3, 8, 10, 11, 16, 17, 18, 20, 21, 25, 26, 27, 28, 29, 30, 31, 33], "20": [2, 7, 18, 30, 31, 32, 34], "2006080250": 30, "200m": 33, "2017": 3, "2019": 3, "2020": 3, "2021": [3, 17, 31, 32], "2022": [3, 31, 32], "2023": [2, 3, 30], "2024": 33, "2048": [2, 6], "205": 34, "20b": 28, "20x": 30, "21": [30, 31, 32], "2104": 2, "2105": 30, "2137": 34, "2195": 34, "2198": 34, "21x": 30, "22": [6, 30, 31, 32], "220m": 34, "220mb": 34, "2211": 2, "2229": 34, "223": 32, "2236": 34, "224": [6, 8, 10, 12, 13, 30, 32, 34], "224m": 34, "2251": 34, "2253": 34, "2257": 34, "2264": 34, "2275": 34, "2278": 34, "2280": 34, "2283": 34, "2290": 34, "2292": 34, "2299": 34, "23": [21, 31, 32], "2315": 34, "2317": 34, "2319": 34, "233": 31, "2334": 34, "2349": 34, "235": 31, "236": 31, "2392": 34, "24": [31, 32], "2412": 34, "2433": 34, "244": 13, "2468": 34, "2469": 34, "2473": 34, "2476": 34, "2480": 34, "2491": 34, "24x": 30, "24xlarg": 32, "25": [31, 32], "2511": 34, "2550": 34, "256": [2, 30], "2561": 34, "2568": 34, "256gb": 30, "2584": 34, "26": [30, 31, 32], "2613": 34, "2617": 34, "2627": 34, "2631": 34, "2641": 34, "2663": 34, "2666": 33, "2675": 34, "26x": 30, "27": [31, 32, 33], "2704": 34, "2733": 34, "274": 32, "2747": 34, "278": 34, "27x": 30, "28": [10, 14, 16, 30, 31, 32, 33, 34], "2883": 34, "29": [7, 31, 32], "2910": 34, "2911": 34, "2928": 34, "29500": [6, 31], "2985": 34, "2987": 34, "29x": 30, "2b": 28, "2d": 18, "2nd": 
28, "2x": 34, "3": [2, 5, 6, 7, 8, 10, 12, 13, 14, 16, 17, 18, 20, 21, 28, 30, 31, 33], "30": [31, 32], "3030": 34, "305": 30, "3079": 34, "3080": 34, "30b": 28, "30ghz": 30, "30x": 30, "31": [31, 32], "3116": 34, "3143": 34, "31x": 30, "32": [2, 6, 18, 21, 23, 30, 31, 32], "3200": 30, "32x": 30, "32x16d": 30, "33": [17, 31, 32], "339081764221191": 14, "33x": 30, "34": [31, 32], "35": [31, 32], "355": 31, "356": 31, "35x": 30, "36": [30, 31, 32], "36x": 30, "37": [31, 32, 34], "38": [31, 32], "384": [10, 32, 34], "384task": 30, "38x": 30, "39": [30, 31, 32, 34], "39x": 30, "3b": 28, "3d": 34, "3e": [10, 34], "3rd": [3, 7, 21, 30, 34], "4": [2, 6, 11, 13, 14, 18, 20, 23, 28, 30, 31, 33], "40": [30, 31, 32, 34], "407": 34, "409": 26, "4096": [2, 33], "40b": 28, "40mb": 34, "41": [31, 32], "42": [31, 32], "425": 34, "43": [6, 11, 31, 32], "432": 34, "438": 34, "44": [30, 31, 32], "44x": 30, "45": [6, 31, 32], "452": 34, "45x": 30, "46": [31, 32], "47": [31, 32], "470": 31, "471": 31, "473": 31, "476": 31, "479": 31, "47x": 30, "48": [30, 31, 32], "48x": 30, "49": [30, 31, 32], "49786": 34, "4bit": 34, "4k": 28, "4th": [28, 30], "4x": 3, "5": [2, 6, 10, 13, 14, 16, 17, 18, 19, 20, 21, 22, 26, 28, 30, 31, 32, 33, 34], "50": [18, 31, 32], "50ghz": 33, "51": [31, 32], "512": [1, 6, 11, 16, 25, 28, 31], "513": 31, "52": [31, 32], "524": 34, "53": [31, 32], "531": 34, "54": [31, 32], "55": [31, 32, 33], "551": 34, "55x": 30, "56": [30, 31, 32, 33], "57": 31, "57x": 30, "58": [17, 31], "589": 34, "58x": 30, "59": 31, "591": 31, "5d": 16, "5m": 34, "5mb": 34, "5rc3": 34, "5x": 34, "6": [2, 5, 7, 11, 14, 20, 30, 31, 32, 33, 34], "60": 31, "602": 34, "61": 31, "62": 31, "62x": 30, "63": [31, 34], "64": [2, 8, 10, 16, 20, 30, 31, 34], "642": 34, "647": 34, "648": 34, "64byte": 34, "64gb": 30, "65": 31, "654": 31, "655": 31, "65536": 33, "657": 34, "66": [17, 31, 34], "67": [30, 31, 34], "674": 34, "67x": 30, "68": [31, 34], "684": 34, "685": 34, "69": [30, 31], "692": 34, "6b": 
[2, 28, 30], "7": [10, 14, 17, 20, 21, 31, 32, 34], "70": 31, "70b": [28, 34], "71": 31, "711": 34, "71x": 30, "72": 31, "73": 31, "74": 31, "75": [30, 31], "75x": 30, "76": [30, 31], "760": [31, 32], "761": [31, 32], "762": 32, "763": 32, "764": 31, "768gb": 30, "77": 31, "77x": 30, "78": [30, 31], "784": 31, "787": 34, "78x": 30, "79": [30, 31], "7b": [6, 28, 30, 34], "7f": 16, "7m": 34, "7x": 34, "8": [14, 16, 30, 31, 32, 33], "80": [5, 30, 31], "81": [30, 31], "8180": 32, "8180m": [14, 33], "81x": 30, "82": 31, "822": 34, "83": [31, 33], "8375c": 32, "8380": 30, "8380h": 30, "83x": 30, "84": [6, 30, 31, 33], "85": [30, 31], "85x": 30, "86": [30, 31], "87": 31, "88": 31, "8b": 28, "8x": 18, "8x7b": 28, "9": [6, 7, 14, 17, 23, 25, 31, 32], "9000": 32, "9000000000": [31, 33], "9001": 32, "9002": 32, "9003": 32, "90ghz": 30, "92": 30, "93": 30, "96": 30, "96x": 30, "97": 30, "975": 32, "98": 30, "981": 32, "982": 32, "99": [16, 30, 34], "992": 34, "A": [2, 5, 6, 7, 10, 11, 17, 26, 28, 31, 33, 34], "And": [15, 20, 32, 34], "As": [10, 19, 20, 28, 31, 32, 33, 34], "At": [7, 17], "But": [17, 18], "By": [17, 31, 33], "For": [1, 2, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19, 20, 21, 23, 24, 25, 26, 28, 29, 31, 32, 33, 34], "If": [2, 5, 6, 7, 8, 9, 10, 13, 14, 15, 16, 17, 20, 26, 31, 32, 33, 34], "In": [1, 2, 6, 7, 8, 12, 16, 17, 18, 19, 21, 23, 28, 31, 32, 33, 34], "It": [2, 6, 7, 8, 10, 13, 17, 18, 20, 21, 23, 26, 29, 31, 33, 34], "Its": 28, "NOT": [18, 31], "No": [2, 18, 34], "Not": 2, "ON": 30, "On": [1, 2, 7, 18, 28, 33], "One": [2, 3, 18, 19, 31, 33], "Such": 17, "The": [0, 1, 2, 5, 6, 7, 8, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 28, 29, 30, 31, 32, 33, 34], "Then": 32, "There": [14, 16, 20, 33, 34], "These": [1, 5, 6, 7, 8, 13, 28], "To": [2, 5, 6, 7, 10, 13, 15, 16, 17, 18, 20, 21, 23, 28, 32, 33, 34], "Will": [6, 18], "With": [1, 2, 7, 10, 20, 31, 34], "_": [13, 15, 16, 17, 18, 20, 30, 31, 32, 33, 34], "___": 13, "_____": 13, "__init__": [5, 6, 8, 
10, 16, 20, 26, 34], "__m256i": 17, "__m512": 17, "__m512i": 17, "__main__": [26, 31, 32, 34], "__name__": [26, 34], "_appli": 18, "_build": 5, "_c": [17, 26], "_cmp_ord_q": 17, "_core": 31, "_cvt_fp32_to_bf16": 17, "_get_current_isa_level": 17, "_get_highest_binary_support_isa_level": 17, "_get_highest_cpu_support_isa_level": 17, "_jit_set_texpr_fuser_en": 26, "_lu_with_info": 8, "_mm256_mask_storeu_epi16": 17, "_mm256_storeu_si256": 17, "_mm512_add_epi32": 17, "_mm512_and_si512": 17, "_mm512_castps_si512": 17, "_mm512_cmp_ps_mask": 17, "_mm512_cvtneps_pbh": 17, "_mm512_cvtusepi32_epi16": 17, "_mm512_loadu_p": 17, "_mm512_mask_blend_epi32": 17, "_mm512_maskz_loadu_p": 17, "_mm512_set1_epi32": 17, "_mm512_srli_epi32": 17, "_native_multi_head_attent": 8, "_reorder_cach": 2, "_sym": 2, "_timestamp_inst": 31, "_timestamp_instance_": 31, "ab": [13, 32], "abi": [6, 17, 34], "abil": 16, "abl": 15, "abnorm": [26, 34], "about": [1, 2, 5, 7, 13, 16, 32, 33, 34], "abov": [2, 5, 10, 19, 28, 30, 31, 32], "absolut": [2, 31], "abstract": [2, 11, 20], "acceler": [1, 2, 3, 6, 7, 13, 28, 29, 30, 34], "accept": [2, 34], "access": [2, 6, 7, 18, 19, 32, 34], "accommod": 18, "accompani": 34, "accord": [2, 13, 28, 33, 34], "accordingli": 16, "account": 6, "accu": 16, "accumul": 2, "accur": 8, "accuraci": [2, 3, 6, 7, 8, 15, 16, 21, 22, 26, 28, 34], "accuracy_criterion": [2, 4, 16, 34], "accuracy_criterion_typ": 2, "accuracy_criterion_valu": 2, "achang": 15, "achiev": [1, 2, 6, 7, 28, 33, 34], "across": 16, "act": 34, "act_ic_observ": 2, "act_observ": 2, "act_quant_mod": 2, "action": [6, 23], "activ": [2, 6, 7, 15, 16, 20, 28, 31, 33], "actual": [18, 21], "acycl": 13, "ad": [2, 7, 10, 33, 34], "adagrad": [19, 21], "adagrad_fused_step": 19, "adagrad_step": 19, "adam": 34, "adapt": 7, "adaptive_avg_pool3d": 8, "adaptive_max_pool3d": 8, "adaptiveaveragepoolingkrnl": 17, "add": [2, 5, 7, 8, 13, 14, 19, 21, 32, 34], "add_": 19, "add_argu": [6, 23], "add_casual_mask": 2, "add_execut": 6, 
"add_help": [6, 23], "addbmm": 8, "addcdiv_": 19, "addcmul_": 19, "addit": [2, 6, 7, 17, 21, 28, 34], "addition": 32, "addlayernorm": 34, "addmm": 8, "addmm_": 8, "addr": 31, "address": [7, 18, 31, 32, 33, 34], "addtion": 17, "adjust": 16, "adopt": [28, 34], "advanc": [1, 2, 6, 7, 16, 25, 28], "advantag": [1, 2, 7, 9, 12, 18, 21, 25, 30, 31, 33], "aes_ni": 17, "affect": [2, 31], "affin": [7, 10, 15, 20, 31, 32, 33], "affinit": 32, "after": [2, 5, 7, 13, 20, 21, 23, 24, 32, 33, 34], "afterward": [31, 33], "ag": 7, "again": [5, 19, 32], "against": 6, "agre": 5, "ahead": 5, "ai": [1, 2, 3, 7, 28], "aim": [7, 10, 16, 33], "aka": [7, 18], "albert": 34, "algorithm": [2, 13, 18, 30, 34], "alia": 2, "alibi": 2, "alibi_slop": 2, "align": [17, 18, 21, 34], "aliv": 32, "all": [2, 5, 6, 8, 13, 14, 17, 19, 20, 28, 29, 32, 33, 34], "all_logical_cor": 14, "all_physical_cor": 14, "allcat": 2, "allenai": 26, "alloc": [2, 10, 20, 28, 30, 32, 34], "allow": [2, 8, 14, 16, 22, 33, 34], "allreduc": 2, "almost": 18, "along": [2, 5, 6, 21, 33, 34], "alpha": [2, 6, 19, 22], "alpha_max": [16, 22], "alpha_min": [16, 22], "alpha_step": [16, 22], "alphafold2": 34, "alreadi": [1, 5, 6, 18, 28, 33], "also": [1, 2, 6, 7, 10, 13, 14, 16, 18, 19, 28, 30, 31, 33, 34], "altern": [2, 6, 18], "although": [2, 33], "alwai": [5, 6, 7, 8, 18, 31, 33, 34], "amazon": 32, "among": [2, 31, 32, 33], "amount": [2, 16, 26, 28, 33], "amp": [4, 6, 10, 23, 26, 34], "amp_dtyp": [6, 23], "amp_en": [6, 23], "ampconf": 34, "amplifi": 1, "amx": [1, 3, 6, 7, 17, 25, 28, 30], "amx_bf16": 17, "amx_int8": 17, "amx_til": 17, "an": [1, 2, 5, 6, 7, 8, 10, 11, 13, 14, 16, 17, 18, 19, 20, 21, 26, 31, 32, 33, 34], "anaconda": 17, "analysi": 33, "ani": [2, 5, 8, 10, 17, 18, 32, 34], "announc": 34, "anonym": 17, "anoth": [14, 31, 33, 34], "answer": [18, 30], "anymor": [7, 34], "anyplac": 4, "ao": [2, 6, 15], "apach": [27, 32], "api": [1, 3, 6, 10, 11, 15, 20, 26, 33, 34], "app": [6, 34], "append": [6, 7], "append_torchlib_if_found": 
6, "appli": [2, 6, 7, 8, 12, 13, 16, 18, 19, 21, 23, 26, 28, 29, 31, 34], "applic": [1, 2, 7, 20, 28, 32, 33], "apply_funct": 2, "appropri": 33, "apr": 3, "ar": [1, 2, 3, 5, 6, 7, 8, 10, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 28, 29, 30, 31, 32, 33, 34], "arang": [2, 6, 16], "arbitrari": 2, "arc": 3, "architectur": [2, 28, 30, 33], "area": [7, 14], "aren": 5, "arg": [2, 4, 6, 7, 14, 16, 19, 23, 31, 32, 34], "argc": 6, "argmax": 16, "argpars": [6, 23], "argument": [2, 6, 7, 22, 26, 31], "argumentpars": [6, 23], "argv": 6, "around": 31, "arrai": 18, "articl": [30, 33], "arxiv": 2, "ask": 5, "assign": [18, 31, 32, 33], "assum": [2, 7, 8, 23, 32, 33, 34], "asu": 33, "asymmetr": 2, "async": [20, 34], "asynchron": [2, 7], "aten": [2, 6, 7, 34], "aten_cpu_cap": 17, "attach": 33, "attent": [1, 2, 7, 28, 34], "attention_mask": [2, 6], "attention_mask_pad": 6, "attn_output": 2, "attn_weight": 2, "attribut": 18, "aug": [3, 30], "auto": [2, 6, 10, 17, 18, 22, 23, 26, 28, 31, 33, 34], "auto_alpha_arg": 16, "auto_ipex": 34, "auto_kernel_select": [2, 7, 30], "autocast": [4, 6, 7, 10, 23, 34], "autoclass": 5, "autoconfig": [6, 23], "autofunct": 5, "autom": [4, 7, 8, 14, 31, 32, 34], "automat": [1, 2, 6, 7, 9, 10, 12, 13, 15, 16, 18, 22, 28, 31, 32, 33, 34], "automaticlli": 2, "automixprecis": 34, "automodelforcausallm": [6, 23, 29, 34], "autotoken": [6, 23], "autotp": 28, "autotun": [2, 4, 22, 34], "avaiabl": 2, "avail": [1, 2, 6, 7, 11, 17, 20, 22, 23, 29, 31, 33, 34], "avg_pool3d": 8, "avoid": [2, 10, 20, 21, 26, 31, 32, 33, 34], "avx": [1, 6, 17, 25, 28], "avx2": [17, 26, 34], "avx256": 17, "avx2_vnni": 17, "avx512": [7, 17, 18, 32, 34], "avx512_4fmap": 17, "avx512_4vnniw": 17, "avx512_bf16": 17, "avx512_bitalg": 17, "avx512_bw": 17, "avx512_cd": 17, "avx512_core_vnni": 34, "avx512_dq": 17, "avx512_er": 17, "avx512_f": 17, "avx512_fp16": 17, "avx512_ifma": 17, "avx512_pf": 17, "avx512_vbmi": 17, "avx512_vbmi2": 17, "avx512_vl": 17, "avx512_vnni": 17, 
"avx512_vp2intersect": 17, "avx512_vpclmul": 17, "avx512_vpopcntdq": 17, "avx_vnni": 17, "awar": [18, 20, 31, 32], "awq": [2, 34], "b": [7, 8, 16, 28], "back": [6, 12, 17, 18, 21, 26], "backbon": 2, "backend": [1, 2, 3, 6, 7, 12, 13, 16, 17, 23, 26, 28, 31, 33, 34], "background": 33, "background_thread": [31, 33], "backpropag": 16, "backward": [6, 7, 8, 16, 21, 33, 34], "bactchnorm": 34, "baddbmm": 8, "bag": [26, 34], "baichuan": [2, 28, 34], "baichuan2": [28, 34], "bake": 34, "balanc": [7, 16, 22, 33], "bandwidth": 28, "base": [1, 2, 3, 4, 5, 6, 7, 10, 11, 17, 20, 21, 26, 28, 29, 30, 32, 33, 34], "base_dir": 29, "base_text_classif": 30, "baselin": [16, 22, 34], "basic": [2, 4, 16, 21, 33, 34], "batch": [2, 6, 7, 13, 16, 18, 20, 23, 26, 30, 32, 34], "batch_decod": [6, 23], "batch_id": 6, "batch_idx": [6, 13], "batch_siz": [2, 6, 11, 13, 16, 18, 23, 32], "batchnorm": [13, 17, 18, 26, 34], "batchnorm2d": [7, 10, 26, 34], "batchsiz": [2, 20], "beam": [2, 28], "beam_idx": 2, "beam_idx_tmp": 6, "beam_width": 28, "becam": 34, "becaus": [8, 17, 18, 21, 28, 33, 34], "becom": [7, 28, 33], "been": [0, 1, 6, 7, 10, 17, 18, 28, 31, 33, 34], "beeter": 28, "befor": [1, 2, 5, 6, 13, 14, 17, 18, 20, 31, 33, 34], "begin": 5, "beginn": 16, "behavior": [2, 20, 31, 33], "behaviour": 10, "being": [7, 33], "believ": [8, 18], "below": [6, 8, 10, 14, 19, 20, 21, 22, 23, 26, 28, 31, 32, 33, 34], "bench": 32, "benchmark": [6, 26, 30, 31, 34], "benefici": 18, "benefit": [6, 7, 8, 10, 20, 21, 28, 32, 33, 34], "benifit": 2, "bert": [3, 4, 10, 30, 34], "bert_int8_jit": 32, "bert_ipex_int8": 32, "bertmodel": [4, 6, 11, 32], "bertmodelmodel": 4, "besid": [28, 33, 34], "best": [2, 6, 7, 8, 14, 16, 17, 22, 24, 28, 33, 34], "beta": [23, 26], "better": [1, 2, 6, 7, 15, 18, 20, 28, 31, 32, 33, 34], "between": [7, 8, 17, 20, 33, 34], "beyond": 7, "bf16": [2, 3, 7, 17, 19, 21, 23, 26, 28, 30, 34], "bf16_gw": 21, "bf16_w": 21, "bfloat16": [2, 3, 4, 7, 10, 11, 17, 18, 23, 29, 31, 34], "bfp16": 34, "bia": 
[2, 8, 20, 34], "big": [7, 18], "bigcod": 28, "bigscienc": 28, "bin": [5, 6, 17, 31, 32], "binari": [5, 6, 7, 8, 17, 34], "binary_cross_entropi": 8, "binary_cross_entropy_with_logit": 8, "bind": [6, 7, 31, 32, 33, 34], "bio": 30, "bit": [21, 28], "blob": 2, "block": [2, 5, 16, 20, 22, 28, 33, 34], "block_numb": 2, "block_siz": 2, "block_tabl": 2, "blocktim": 31, "blockwis": 16, "blog": [2, 34], "bloom": [2, 28], "bmm": [8, 34], "bmp": 18, "bn": [2, 10, 15, 26, 34], "bn_fold": 2, "bodi": 17, "bool": [2, 14], "boolean": [7, 34], "booltensor": 7, "boost": [3, 6, 7, 9, 21, 30, 31, 33, 34], "both": [1, 2, 6, 7, 16, 18, 19, 21, 28, 29, 31, 32, 33, 34], "bother": 16, "bottl": 19, "bottleneck": [2, 28], "bottom": 21, "bound": [19, 20, 28, 33], "box": [6, 10, 33], "branch": [1, 7, 30], "break": [6, 16, 34], "brew": 5, "brief": [18, 28, 34], "briefli": 33, "bring": [2, 6, 7, 9, 15, 16, 21, 28, 31, 33, 34], "broad": [7, 9, 34], "broader": 34, "brought": [33, 34], "buffer": [2, 28], "bug": [1, 5, 34], "bui": 21, "build": [6, 28, 33, 34], "built": [7, 17, 20, 34], "busi": 33, "c": [1, 7, 8, 16, 17, 20, 26, 28, 31, 32, 33, 34], "c1": 20, "c10": [6, 17], "c620": 33, "cach": [2, 5, 7, 19, 20, 30, 34], "cache_weight_for_large_batch": 2, "caff": 3, "calcul": [1, 2, 8, 16, 21, 22], "cali_dataset": 34, "calib_dataload": [2, 6, 16, 34], "calib_dataset": [6, 29], "calib_evalu": 6, "calib_func": 2, "calib_sampl": 29, "calibr": [2, 13, 22, 26, 29, 30, 32, 34], "calibrated_model": 34, "calibration_data_load": [4, 6, 13], "calibration_data_set": [15, 34], "calibration_model": 29, "calibration_sampl": 6, "call": [2, 6, 8, 13, 17, 18, 21, 26, 32, 33, 34], "caller": [26, 34], "can": [1, 2, 5, 6, 7, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 26, 28, 29, 30, 31, 32, 33, 34], "cannot": [8, 19, 26, 31, 34], "canon": 18, "capabl": [3, 17, 34], "capac": [21, 30], "captur": [4, 34], "card": 18, "care": 32, "carri": 30, "case": [2, 6, 7, 9, 12, 16, 17, 18, 28, 31, 33, 34], "cases": 32, "cast": 
[2, 8, 21, 28], "casual": 26, "cat": [8, 31, 32, 34], "catch": 6, "categor": 7, "categori": [8, 34], "caus": [2, 7, 21, 26, 28, 31, 33, 34], "causal": 2, "cc": [5, 6, 17], "ccl": [6, 31, 34], "cd": [5, 6], "cdist": 8, "center": 34, "cento": 30, "cerr": 6, "certain": [1, 7, 26, 28, 29, 31, 33], "ch_axi": 2, "chain": 21, "chang": [2, 5, 6, 7, 8, 10, 11, 12, 15, 17, 18, 20, 23, 25, 26, 29, 31], "changed_onli": 5, "changelog": 34, "channel": [2, 3, 10, 15, 16, 26, 34], "channels_last": [6, 7, 18, 23, 33, 34], "char": 6, "charact": 5, "chat": 28, "chatglm": [2, 28], "chatglm2": [28, 34], "chatglm3": [28, 34], "cheat": 23, "check": [2, 5, 6, 7, 13, 18, 28, 29, 31, 34], "check_trac": [6, 13, 32], "checkpoint": [2, 6, 29], "checkpoints_json": 29, "chip": 33, "chipset": 33, "choic": [6, 21, 23, 31], "choleski": 8, "cholesky_invers": 8, "cholesky_solv": 8, "choos": [6, 8, 20, 23, 31, 33, 34], "chosen": [8, 14, 17], "chw": 18, "chwn": 18, "ci": 5, "cifar10": [6, 13], "circumst": 8, "clamp": 13, "clang": 5, "class": [2, 5, 6, 7, 8, 10, 16, 20, 26, 34], "classif": [26, 30], "claus": [7, 10, 19], "clean": 5, "clear": 10, "clibrat": 34, "click": 3, "clone": 5, "close": [18, 31, 33], "cloud": 3, "clr": 19, "cmake": [5, 6, 17, 34], "cmake_minimum_requir": 6, "cmakefil": 17, "cmakelint": 5, "cmakelist": 6, "cnn": [7, 18, 26, 30, 33, 34], "co": [2, 34], "coco": 30, "code": [1, 2, 5, 6, 7, 10, 11, 12, 13, 18, 19, 21, 23, 24, 26, 27, 29, 33, 34], "codegen": [2, 28, 34], "codeless": 31, "codellama": 28, "codenam": 34, "collabor": 3, "collate_batch": 6, "collate_fn": 6, "collect": [6, 32, 33, 34], "column": 6, "com": [2, 5, 34], "combin": [2, 12, 14, 28, 31, 34], "come": 33, "comma": 33, "command": [4, 5, 6, 14, 23, 31, 32, 33, 34], "comment": [5, 14, 17, 22, 34], "commit": 5, "common": [17, 21, 28, 31, 33], "commonli": [7, 28, 33, 34], "commun": [6, 28, 31, 32, 33, 34], "communication_backend_nam": 29, "compact": [31, 32, 33], "compar": [1, 2, 7, 13, 18, 21, 26, 28, 30, 31, 33, 34], 
"compat": [17, 21], "compet": 33, "competit": 33, "compil": [1, 5, 6, 23, 26, 33, 34], "complet": [5, 6, 14, 18, 22, 29, 33], "complex": 17, "complexdoubl": 17, "complexfloat": 17, "complic": [26, 31, 33], "complier": 17, "compon": [15, 26, 27, 28], "compos": [6, 13], "comprehens": [1, 34], "compressor": [3, 7, 16, 22, 34], "compris": 18, "compuat": 13, "comput": [2, 6, 7, 13, 15, 16, 18, 20, 21, 28, 30, 31, 32, 33, 34], "concat": [2, 20, 26, 28, 34], "concat_fp32_from_bf16": 21, "concat_linear": 2, "concat_output": 2, "concaten": [2, 21], "concept": [18, 33], "concern": 7, "conclud": [30, 34], "conclus": 18, "concurr": [32, 33], "conda": [5, 33], "conda_prefix": [31, 32], "condit": 27, "conduct": 7, "conf": [4, 13, 14, 31, 34], "conf_fil": [14, 34], "confer": 3, "config": [2, 6, 11, 23, 31, 32], "configur": [2, 4, 6, 7, 14, 15, 16, 17, 31, 32, 34], "confirm": 31, "conflict": [7, 17], "connect": 33, "consecut": 33, "consider": 16, "consist": [16, 28, 33, 34], "const": [6, 17], "constant": 13, "constraint": [2, 34], "construct": [2, 7, 13], "consum": [7, 14], "consumpt": 34, "contain": [2, 5, 6, 13, 17, 26, 31, 32, 33, 34], "containeraliasingtest": 5, "content": [29, 34], "context": [2, 5, 6, 8, 20, 28, 33, 34], "context_len": 2, "contigu": [6, 13, 18, 33, 34], "contiguous_format": [18, 33], "continu": [31, 32, 34], "contribut": [28, 31, 34], "control": [1, 2, 7, 20, 26, 31, 33, 34], "conv": [2, 8, 10, 13, 15, 20, 26, 34], "conv1d": [8, 13], "conv2": 20, "conv2d": [2, 7, 8, 10, 13, 18, 20, 26, 34], "conv3d": [8, 13, 34], "conv_bn": 2, "conv_bn_fold": [2, 26, 34], "conv_tbc": 8, "conv_transpose1d": 8, "conv_transpose2d": 8, "conv_transpose3d": 8, "conveni": [8, 34], "convers": [2, 8, 13, 34], "convert": [1, 2, 4, 6, 7, 8, 9, 10, 13, 16, 17, 18, 20, 23, 26, 32, 34], "convert_model": [4, 13, 15, 16], "converted_model": [4, 6, 26, 34], "convolut": [2, 6, 7, 13, 20, 33, 34], "convolution1d": 34, "convolutuon": 2, "convrelu": 13, "convsumrelu": 13, "convtranspose2d": [2, 
13], "convtranspose3d": 13, "coo": 18, "cooper": [7, 30, 34], "copi": [5, 17, 18], "copyright": [17, 27], "core": [2, 7, 14, 17, 30, 33, 34], "core_id": [2, 20, 31], "correct": [7, 18, 25, 34], "correspond": [20, 31, 34], "cosine_embedding_loss": 8, "cost": [2, 6, 28, 30, 33], "costli": 33, "could": [7, 13, 16, 18, 26, 32, 33, 34], "count": 31, "counterpart": [2, 7, 18, 34], "coupl": [20, 33, 34], "cout": 6, "cover": [13, 18, 31], "cpp": [5, 6, 33], "cppsdk": 34, "cpu": [1, 3, 4, 5, 6, 7, 8, 10, 14, 15, 16, 19, 20, 23, 25, 26, 28, 30, 31, 32, 34], "cpu_capability_avx512": 17, "cpu_capability_avx512_bf16": 17, "cpu_featur": 17, "cpu_feature_main": 17, "cpu_launcher_arg": 32, "cpu_launcher_en": 32, "cpu_pool": [2, 20, 34], "cpu_pool1": 20, "cpu_pool2": 20, "cpuid": 17, "cpuinfo": 17, "cpunodebind": 33, "cpupool": [2, 20, 34], "crash": [31, 33, 34], "creat": [7, 16, 20, 33, 34], "creation": 2, "creator": 34, "credit": 17, "criteria": 16, "criterion": [6, 8, 16, 22], "cross": [32, 33, 34], "cross_entropy_loss": 8, "crossentropyloss": [6, 16], "csrc": 26, "csv": 14, "ctc_loss": 8, "cu": 5, "cu_seqlens_kv": 2, "cu_seqlens_q": 2, "cudnn": 18, "current": [1, 2, 5, 7, 11, 13, 14, 15, 16, 17, 19, 20, 26, 28, 29, 34], "current_posit": 2, "custom": [1, 2, 7, 26, 34], "customized_forward": 10, "cv": 34, "cvt_fp32_to_bf16": 17, "cvt_fp32_to_bf16_kernel_fn": 17, "cvt_fp32_to_bf16_kernel_impl": 17, "cvt_fp32_to_bf16_kernel_stub": 17, "cvtfp32tobf16": 17, "cvtfp32tobf16krnl": 17, "cxx": [6, 17], "cxx11": 34, "cxx_standard": 6, "d": [4, 5, 6, 7, 8, 13, 26, 28, 34], "d8": 33, "d__avx512f__": 17, "d__avx__": 17, "dag": 13, "daili": 34, "data": [2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 23, 26, 31, 32, 34], "data_typ": 18, "databrick": 28, "dataload": [2, 6, 10, 13, 16, 20, 22, 29, 34], "dataset": [6, 13, 16, 29, 30, 33, 34], "dataset_nam": [10, 34], "datatyp": [20, 34], "date": 34, "dcmake_prefix_path": 6, "dcpmm": 30, "dcpu_cap": 17, "dcpu_capability_amx": 17, 
"dcpu_capability_avx2": 17, "dcpu_capability_avx512": 17, "dcpu_capability_avx512_bf16": 17, "dcpu_capability_avx512_fp16": 17, "dcpu_capability_avx512_vnni": 17, "dcpu_capability_default": 17, "ddp": [2, 6], "ddr": 30, "ddr4": 33, "dealloc": 33, "debug": [2, 31], "debug_squad": [10, 34], "dec": 3, "decai": 7, "decid": [2, 15, 20, 28], "decim": 21, "declar": 17, "decltyp": 17, "decod": [2, 28, 30, 34], "deconv3d": 34, "decor": 2, "dedic": [2, 6, 28, 34], "deduct": 31, "deep": [3, 7, 8, 11, 13, 14, 21, 33], "deepcopi": 2, "deepspe": [2, 34], "def": [2, 6, 8, 10, 16, 20, 26, 34], "default": [2, 4, 6, 7, 10, 12, 13, 15, 16, 17, 20, 22, 23, 26, 28, 30, 32, 33, 34], "default_dynamic_qconfig": [15, 32], "default_dynamic_qconfig_map": 6, "default_dynamic_qconfigprepared_model": 4, "default_static_qconfig": [13, 15, 32, 34], "default_static_qconfig_map": 6, "default_static_qconfigprepared_model": 4, "defin": [2, 5, 6, 7, 8, 10, 16, 17, 18, 22, 32], "definit": [17, 21, 34], "deinit": 5, "deliv": [7, 28, 34], "demand": [2, 7], "demonstr": [6, 18, 26, 32], "demostr": 23, "denomin": 2, "denot": 21, "dens": [7, 18], "dep": 34, "depend": [5, 7, 17, 18, 25, 26, 33, 34], "deploi": 34, "deploy": [2, 7, 13, 34], "deployment_mod": [2, 6, 23], "deprec": [3, 26], "dequant": [13, 16], "desc": 18, "describ": [8, 13, 18, 21, 32, 33], "descript": [4, 7, 16, 18, 20, 25, 33, 34], "descriptor": 34, "design": [2, 5, 8, 18, 21, 29, 34], "desir": [16, 31], "destroy_process_group": 6, "destruct": 33, "detail": [2, 5, 6, 7, 8, 9, 11, 13, 17, 18, 24, 25, 26, 28, 30, 32, 33, 34], "detect": [1, 6, 12, 17, 26, 33, 34], "detectron2": 18, "determin": [2, 6, 17, 21, 33], "develop": [1, 3, 6, 28, 30, 33, 34], "devic": [1, 2, 15, 29, 31, 34], "device_nam": [7, 8], "diagram": [18, 33], "dict": [2, 6, 23], "dictionari": 34, "did": [33, 34], "didn": 20, "differ": [1, 2, 7, 15, 16, 17, 18, 20, 28, 31, 32, 33, 34], "difficult": 18, "difficulti": 16, "diffus": [3, 34], "digit": 21, "dim": [2, 6, 18, 23], 
"dimens": [2, 18, 26], "dinner": [6, 23], "dir": [17, 31], "direct": [2, 5, 13], "directli": [2, 6, 33, 34], "directori": [1, 5, 6, 14, 29, 31, 32], "dirty_decay_m": [31, 33], "disabl": [2, 6, 7, 13, 26, 31, 33, 34], "disable_auto_channels_last": 9, "disable_iomp": [14, 32], "disable_numactl": [14, 32], "disadvantag": 21, "discret": 1, "discrete gpu": 1, "discuss": [5, 18, 33], "dispatch": [1, 34], "dist": 6, "dist_sampl": 6, "distilbert": 30, "distribut": [2, 3, 7, 16, 31, 32, 33, 34], "distributeddataparallel": [6, 34], "distributedsampl": 6, "div": 13, "divid": [2, 13, 31, 32, 33, 34], "divis": [2, 20], "divisor": [2, 20], "dl": [3, 7, 34], "dlopen": 20, "dlrm": [3, 7, 26, 30, 34], "dnnl": 30, "dnnl_verbos": 2, "do": [2, 5, 8, 16, 18, 20, 21, 26, 28, 30, 31, 32, 33, 34], "do_ev": [10, 34], "do_sampl": [6, 23], "doc": [1, 2, 5, 11, 29, 34], "doc_strid": [10, 34], "docker": [30, 34], "dockerfil": 34, "dockerhub": 34, "docstr": 5, "document": [0, 7, 17, 20, 29, 34], "doe": [2, 7, 13, 18, 20, 26, 34], "doesn": [2, 15, 16, 18, 26, 34], "dolli": [28, 34], "domin": [1, 7, 28], "don": [2, 5, 8, 14, 17, 34], "done": [6, 10, 16, 17, 26, 33, 34], "dot": [2, 7, 18, 28], "doubl": 17, "down": [5, 32, 34], "download": [6, 13, 16], "downstream": 8, "dpc": 1, "dpcpp": 34, "dram": 2, "dramat": [32, 33], "drawback": [2, 21], "drive": [1, 7, 28], "driven": 2, "drop": [31, 32], "dropout": [2, 10], "dst": 17, "dtype": [2, 4, 6, 7, 8, 10, 11, 13, 15, 16, 17, 23, 26, 29, 31, 34], "due": [1, 8, 10, 17, 20, 26], "dummi": 32, "dummy_tensor": 32, "dummymodul": 10, "dump": [2, 31], "durat": [2, 21], "dure": [4, 6, 7, 10, 13, 16, 21, 31, 33, 34], "dynam": [1, 4, 20, 28, 32, 33, 34], "dynamic_qconfig": 15, "dynamic_quantized_model": 6, "e": [1, 2, 6, 7, 8, 12, 16, 17, 18, 28, 31, 33, 34], "each": [2, 8, 14, 16, 17, 19, 20, 21, 31, 32, 33, 34], "eager": [1, 7, 12, 23, 32, 34], "earli": [2, 34], "earlier": 21, "eas": [7, 18, 34], "easi": [1, 3, 21], "easier": [2, 18, 21], "easili": [10, 15], 
"ec2": 32, "edit": [5, 26, 34], "effect": [2, 17, 21, 26, 32, 33], "effici": [1, 7, 11, 19, 20, 28, 31, 33, 34], "effort": 34, "eig": 8, "einsum": 34, "either": [2, 26, 31], "el8_4": 30, "elaps": 33, "element": [2, 18, 19], "eleutherai": [2, 28], "elif": 6, "elimin": 28, "els": [6, 14, 17, 18, 23], "elser": 34, "eltwis": 34, "elu": 13, "emb": 7, "emb1": 7, "emb2": 7, "emb3": 7, "emb_m": 7, "embed": [2, 7, 28, 34], "embedding_bag": 10, "embedding_spec": 7, "embeddingbad": 34, "embeddingbag": [7, 26, 34], "embeddingspec": 7, "embedingbag": 7, "emblist": 7, "emerg": [1, 7, 28], "emphas": 33, "emply_lik": 2, "empow": 3, "empti": [18, 31], "enabl": [1, 2, 3, 4, 6, 7, 8, 10, 13, 16, 18, 20, 22, 23, 26, 28, 31, 32, 33, 34], "enable_auto_channels_last": 9, "enable_auto_mix_precis": 34, "enable_auto_mixed_precis": 34, "enable_auto_optim": 34, "enable_blockwise_loss": [16, 22], "enable_jemalloc": 32, "enable_onednn_fus": [2, 13], "enable_tcmalloc": 32, "encod": 34, "encount": [26, 34], "encourag": 34, "end": [6, 13, 20, 34], "endif": 17, "endl": 6, "engin": [1, 6, 18, 33], "enhanc": [1, 3, 28, 34], "enough": [2, 7, 19], "ensur": [11, 19, 20, 32], "entir": [2, 16, 28], "enumer": [6, 13, 16, 29], "env": [6, 29], "env_key1": 5, "env_key2": 5, "env_val1": 5, "env_val2": 5, "environ": [2, 5, 6, 17, 20, 24, 28, 30, 31, 32, 33], "ep": [2, 7, 10, 19], "epoch": 16, "equal": [2, 15, 20, 31, 32, 33], "equip": 33, "equival": 34, "error": [2, 5, 6, 7, 10, 16, 18, 21, 22, 26, 34], "especi": [2, 5, 28, 34], "etc": [2, 5, 6, 17, 34], "eval": [2, 4, 6, 8, 10, 11, 12, 13, 15, 16, 20, 23, 26, 29, 32, 34], "eval_func": [2, 16, 34], "eval_funct": 4, "evalu": [2, 16, 34], "even": [2, 5, 7, 33, 34], "evenli": 31, "everi": [2, 28], "exact": 2, "exactli": 21, "exampl": [2, 5, 7, 8, 13, 18, 19, 21, 22, 23, 24, 25, 28, 29, 32, 33, 34], "example_input": [2, 4, 6, 13, 15, 29, 32, 34], "example_kwarg_input": 2, "examplenet": 20, "examplenet1": 20, "examplenet2": 20, "exce": [26, 30, 33, 34], "except": 
[28, 31], "excess": 34, "excit": 34, "exclus": 31, "execut": [2, 4, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 19, 20, 26, 31, 32, 33, 34], "exetens": 2, "exhibit": 30, "exist": [1, 5, 7, 13, 26, 31, 33], "exit": [6, 31], "exp": 13, "expect": [2, 7, 30, 34], "expecttest": 5, "expens": 18, "experi": [5, 7, 10, 12, 16, 18, 26, 33, 34], "experiment": 34, "explain": [17, 18, 21], "explicit": [18, 20, 33], "explicitli": [2, 8, 16, 20, 26, 31, 34], "explor": 2, "expon": 21, "export": [4, 31, 33], "expos": 8, "express": [18, 34], "ext": [6, 34], "extend": [1, 18, 25, 33, 34], "extens": [2, 3, 4, 6, 9, 10, 13, 14, 16, 17, 23, 24, 25, 27, 28, 29, 30, 31, 33, 34], "extra": [2, 5, 10, 20, 31, 32], "extra_rope_config": 2, "extrem": [7, 14, 33], "f": [5, 6, 13, 16, 28, 34], "f1": 30, "f16c": 17, "f32": [17, 18], "f401": [6, 11, 12, 13, 16, 23, 29], "face": 3, "facebook": [3, 6, 28], "facilit": 34, "fact": [18, 33], "factor": [2, 6, 16, 31], "fail": [10, 26, 34], "failur": [12, 34], "fake": 2, "fake_quantize_per_tensor_affin": 8, "falcon": [2, 28, 34], "fall": [6, 12], "fals": [2, 4, 6, 7, 8, 13, 14, 15, 16, 17, 20, 22, 23, 26, 31, 32, 34], "famili": [2, 28, 33], "fashionmnist": 16, "fast": [4, 12, 33, 34], "fast_bert": [2, 4, 6, 7, 11, 34], "fast_layer_norm": [2, 34], "faster": [2, 6, 7, 8, 30, 33], "fastest": 17, "fastlayernorm": [2, 34], "fatal_error": 6, "favorit": 31, "fb": 34, "feasibl": 10, "featur": [0, 1, 2, 3, 5, 8, 10, 13, 14, 18, 20, 23, 25, 26, 28, 30, 31, 32, 33, 34], "feb": 3, "feed": [2, 9, 18], "feedback": 34, "feedforward": 28, "feel": [5, 18, 34], "few": [5, 7, 9, 13, 16, 18, 32, 34], "fewer": 21, "fft_fft": 8, "fft_fft2": 8, "fft_fftn": 8, "fft_hfft": 8, "fft_ifft": 8, "fft_ifft2": 8, "fft_ifftn": 8, "fft_ihfft": 8, "fft_irfft": 8, "fft_irfft2": 8, "fft_irfftn": 8, "fft_rfft": 8, "fft_rfft2": 8, "fft_rfftn": 8, "figur": [1, 2, 21, 28, 33], "file": [2, 4, 5, 6, 8, 14, 15, 16, 17, 18, 31, 34], "filenam": 5, "find": [1, 2, 7, 14, 16, 23, 26, 30, 31, 34], 
"find_packag": 6, "findavx": 17, "fine": [3, 20, 29, 31, 32, 33, 34], "finer": [1, 7, 20], "finish": [6, 11, 12, 13, 16, 20], "first": [2, 3, 5, 6, 7, 9, 10, 12, 16, 19, 20, 21, 26, 31, 32, 33], "firstli": [2, 28], "fit": [5, 7, 33, 34], "fix": [2, 5, 7, 34], "flag": [2, 5, 7, 17, 20, 31, 34], "flake8": 5, "flan": 28, "flash": 34, "flash_atten_varlen": 2, "flatten": [16, 20], "flexibl": 34, "float": [2, 6, 7, 8, 14, 15, 16, 17, 21, 29, 34], "float16": [2, 8], "float32": [2, 13, 21, 23, 26, 30, 31, 34], "float64": 8, "flourish": 28, "flow": 26, "flush": [6, 23], "fma": 17, "fn_type": 17, "focu": [2, 10, 18, 29, 34], "focus": [13, 34], "fold": [2, 10, 15, 16, 26, 34], "folder": 5, "follow": [1, 2, 4, 5, 6, 7, 8, 11, 14, 15, 16, 17, 18, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 34], "footbal": 7, "footprint": [7, 21, 28, 34], "forg": 33, "fork": [17, 33], "format": [2, 5, 6, 7, 9, 14, 22, 26, 28, 31, 33, 34], "format_tag": 18, "former": 6, "formerli": [30, 33, 34], "formula": 21, "forward": [2, 6, 8, 13, 16, 20, 21, 26, 32, 33, 34], "found": [1, 6, 7, 14, 16, 18, 29, 31, 32, 33, 34], "foundat": [18, 33], "fp16": [2, 6, 17, 29], "fp32": [2, 4, 16, 17, 19, 21, 23, 28, 34], "fp32_gw": 21, "fp32_w": 21, "fpn": 30, "fraction": 21, "fractional_max_pool2d": 8, "fractional_max_pool3d": 8, "fragment": 33, "framework": [5, 34], "free": [31, 34], "freez": [6, 8, 10, 13, 15, 16, 20, 23, 26, 32, 34], "freezed_model": [26, 34], "frequenc": [2, 30], "frequent": 7, "friendli": [7, 33], "from": [1, 2, 3, 4, 5, 8, 10, 11, 13, 15, 16, 17, 18, 19, 20, 21, 23, 25, 28, 29, 31, 32, 33, 34], "from_embeddingbag_list": 7, "from_pretrain": [4, 6, 11, 23, 29, 32], "front": [13, 34], "frontend": [1, 2, 7, 20, 28, 34], "frozenbatchnorm": 34, "frozenbatchnorm2d": 7, "fsi": 34, "fulfil": 20, "full": [2, 5, 18, 32, 33, 34], "fulli": [5, 15, 17, 21, 31, 33, 34], "function": [2, 5, 6, 7, 8, 10, 11, 12, 14, 15, 17, 20, 21, 23, 26, 28, 29, 31, 33, 34], "further": [1, 2, 5, 6, 7, 18, 20, 28, 33, 
34], "fuse": [2, 7, 13, 16, 19, 28, 34], "fuse_update_step": 2, "fusion": [1, 2, 7, 10, 21, 28, 34], "futur": [7, 28, 34], "futuretensor": 20, "fx": [3, 7, 10, 26, 34], "g": [2, 7, 8, 16, 17, 18, 28, 34], "gain": [1, 7, 26, 28, 34], "game": 7, "gave": 14, "gb": 20, "gcc": 17, "gcp": 3, "gelu": [2, 13, 34], "gemm": [7, 18, 26, 28, 34], "gen": [3, 30, 34], "gen_": 2, "gen_id": [6, 23], "gen_text": [6, 23], "genai": [1, 7, 28], "gender": 7, "gener": [1, 5, 6, 7, 10, 12, 16, 17, 18, 21, 23, 28, 29, 30, 31, 32, 33, 34], "generate_kwarg": [6, 23], "genv": 31, "geomean": 34, "geqrf": 8, "get": [1, 2, 3, 4, 6, 7, 10, 11, 15, 17, 20, 21, 22, 26, 28, 29, 30, 31, 33, 34], "get_acceler": 29, "get_core_list_of_node_id": 2, "get_cpp_typesize_and_vecs": 17, "get_cpp_typesize_and_vecsize_kernel_fn": 17, "get_cpp_typesize_and_vecsize_kernel_impl": 17, "get_cpp_typesize_and_vecsize_kernel_stub": 17, "get_smooth_quant_qconfig_map": [2, 6, 29], "get_weight_only_quant_qconfig_map": [2, 6, 29], "getattr": [6, 23], "getveclength": 17, "getveclengthkrnl": 17, "gif": 31, "gil": 20, "git": [2, 5, 28], "github": [1, 2, 5, 6, 7, 8, 34], "give": [32, 34], "given": [2, 6, 13, 14, 16, 28], "global": [2, 20, 22, 34], "global_past_key_valu": 6, "gnu": [6, 17, 32], "go": [2, 5, 8], "gomp_cpu_affin": 33, "good": [1, 2, 5, 7, 12, 18, 19, 28, 33, 34], "googl": [3, 5, 28], "gperftool": 33, "gpertool": 33, "gpt": [2, 28, 30], "gpt2": 26, "gptbigcod": [2, 28], "gptj": 2, "gptjforcausallm": 2, "gptq": [2, 6, 34], "gpu": [1, 3, 18, 34], "grad": [7, 19], "grad0": 19, "grad1": 19, "grad_i": 19, "grad_n": 19, "gradient": 7, "grain": [1, 3, 7, 20], "granular": [2, 31, 32, 33], "graph": [1, 4, 8, 10, 16, 23, 26, 31, 34], "graph_for": 13, "graph_mod": [2, 4, 7, 12, 34], "graphic": 33, "great": 33, "greater": 2, "greedi": [6, 23], "grid": 14, "grid_sampl": 8, "grokk": 3, "ground": 21, "group": [2, 19, 20, 33], "group_norm": 8, "group_siz": 2, "gru": 15, "grucel": 15, "gt": [4, 14, 28, 33], "gtest_filt": 5, 
"guid": [3, 6, 7, 17, 32, 34], "guidanc": 7, "guidelin": 18, "gw": 21, "h": [5, 6, 7, 16, 18, 26, 31, 32], "ha": [0, 1, 2, 7, 10, 14, 17, 18, 20, 21, 26, 28, 30, 31, 33, 34], "had": [6, 33], "half": [2, 7, 17, 21], "halv": 21, "handl": [6, 18, 33], "handler": 32, "hang": [33, 34], "happen": 7, "hard": [18, 26], "hardsigmoid": 34, "hardswish": [13, 34], "hardtanh": 13, "hardwar": [1, 3, 17, 25, 28, 32, 34], "hav": 17, "have": [1, 2, 5, 6, 7, 9, 14, 17, 18, 20, 21, 23, 26, 27, 28, 30, 31, 32, 33, 34], "head": [2, 34], "head_dim": 2, "head_map": 2, "head_mask": 2, "head_num": 2, "head_siz": 2, "header": 17, "heavi": 7, "heavier": 28, "height": 18, "hello": 5, "help": [2, 5, 6, 17, 23, 28, 31, 33, 34], "helper": 2, "here": [5, 8, 10, 13, 16, 17, 18, 20, 26, 32, 33, 34], "herebi": 16, "hero": 34, "heterogen": 34, "heurist": [2, 20, 34], "hf": [6, 28], "hf_beam_sampl": 34, "hf_beam_search": 34, "hf_greedy_search": 34, "hf_sampl": 34, "hidden": [2, 18, 28], "hidden_s": [2, 6], "hidden_st": 2, "high": [19, 21, 33], "higher": [2, 7, 13, 17, 18, 28], "higher_is_bett": 14, "highli": [7, 23, 28, 33, 34], "hinge_embedding_loss": 8, "hint": [2, 20], "histogram": [30, 34], "histogramobserv": [2, 15], "histori": [2, 14, 28], "hobbi": 7, "hold": [18, 33], "home": [31, 32], "homebrew": 5, "hood": 34, "hook": [10, 16], "hopefulli": 7, "host": [30, 34], "hostfil": 31, "hostnam": 31, "hotspot": 28, "how": [1, 2, 10, 15, 17, 18, 23, 28, 31, 32, 33, 34], "howev": [2, 5, 7, 8, 9, 16, 20, 26, 28, 31, 33, 34], "hp": 14, "hpc": 11, "html": [2, 5, 16], "http": [2, 5, 16, 34], "hub": 28, "huber_loss": 8, "hug": 3, "huge": [7, 14, 33], "hugginfac": 34, "huggingfac": [2, 6, 26, 28, 32, 34], "huggingface_transform": 32, "hurt": 20, "hw": 18, "hwc": 18, "hwio": 18, "hwn": 18, "hydra": 31, "hyper": [2, 30, 33, 34], "hyperparam": 14, "hyperparamet": [4, 7], "hyperparamt": 14, "hyperthread": 32, "hypertun": [4, 34], "hypertune_directori": 14, "hypervisor": 34, "hypothesi": 5, "i": [1, 2, 3, 5, 6, 7, 
8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 21, 22, 23, 26, 27, 28, 29, 30, 32, 33, 34], "i_mpi_pin_domain": 31, "iakv": [2, 28], "ic": 2, "ic_block": 2, "id": [2, 31, 32], "idea": [11, 21, 33], "ideep": [17, 18], "ident": [2, 10, 18], "identif": [6, 17], "identifi": 34, "idx": [2, 28, 31], "ieityuan": 28, "illeg": 34, "illustr": [18, 19, 21, 31, 33], "imag": [8, 13, 18, 33, 34], "image_classifi": 32, "imagenet": [18, 30], "immedi": 7, "immintrin": 17, "impact": [2, 7, 20], "imper": [20, 34], "impl": 17, "implement": [1, 5, 7, 11, 19, 26, 28, 33, 34], "implicit": 18, "implicitli": 6, "import": [1, 2, 4, 5, 6, 7, 10, 11, 12, 13, 15, 16, 17, 18, 20, 21, 23, 25, 26, 28, 29, 32, 33, 34], "impract": [7, 14], "improv": [1, 3, 7, 8, 13, 20, 22, 28, 30, 32, 33], "in1": 7, "in2": 7, "in3": 7, "in_i": 7, "in_m": 7, "inaccur": 21, "inc": [16, 17, 22, 28], "includ": [1, 2, 5, 6, 7, 10, 14, 15, 17, 23, 26, 27, 28, 30, 34], "inclus": 33, "incorrect": [12, 26, 34], "increas": [1, 2, 3, 21, 26, 28, 30, 33, 34], "independ": 31, "index": [2, 5, 18, 28, 33], "index_copi": 8, "index_to_nam": 32, "indic": [2, 6, 18, 28], "indirect": 2, "indirect_access_kv_cache_attent": [2, 34], "indirectaccesskvcacheattent": [2, 34], "individu": [5, 30], "inductor": [7, 34], "inevit": 10, "inf": 14, "infer": [2, 3, 4, 7, 10, 11, 12, 15, 18, 20, 21, 23, 26, 30, 33, 34], "inferenc": 2, "inference2": 30, "inference3": 30, "inference_mod": [6, 23, 29], "influenc": [31, 33], "info": [2, 6, 17, 26, 31, 32, 34], "inform": [1, 2, 6, 7, 14, 17, 18, 28, 31, 32, 33, 34], "ingredi": 18, "init": [2, 5, 15, 34], "init_alpha": [16, 22], "init_distribut": 29, "init_infer": 29, "init_method": 6, "init_process_group": 6, "initi": [2, 20, 32], "inject": 34, "inlin": 17, "inplac": [2, 4, 6, 13, 15, 18, 23, 32], "input": [2, 6, 7, 9, 10, 13, 15, 16, 17, 18, 22, 23, 26, 29, 30, 32, 33, 34], "input1": 10, "input_channel": 2, "input_hint": 20, "input_id": [6, 23], "input_ids_pad": 6, "input_s": [6, 23], "input_split_hint": [2, 
20], "input_tokens_length": [6, 23], "inputpath": 32, "insert": [2, 16], "insid": [2, 5, 20, 31], "inspir": 34, "instal": [4, 5, 6, 23, 25, 26, 28, 33, 34], "instanc": [2, 7, 10, 14, 32, 34], "instance_idx": 31, "instancenorm": 34, "instanti": 6, "instead": [7, 8, 14, 19, 20, 29, 30, 31, 32, 33, 34], "instruct": [1, 2, 5, 6, 7, 8, 17, 21, 23, 24, 25, 28, 30, 33, 34], "int": [2, 6, 7, 14, 17, 23, 26, 29, 31, 34], "int4": [2, 28, 29, 34], "int8": [1, 2, 3, 4, 17, 18, 20, 22, 28, 29, 34], "int8_qconfig": 6, "integ": [28, 31, 33], "integr": [7, 18, 28, 33, 34], "intel": [2, 3, 4, 7, 8, 9, 10, 11, 13, 14, 16, 17, 20, 21, 22, 23, 25, 26, 27, 28, 29, 34], "intel discrete gpu": 1, "intel optim": 1, "intel_extension_for_pytorch": [1, 2, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17, 20, 23, 25, 29, 32, 34], "intel_pytorch_extens": [7, 25, 26, 34], "intel\u00ae extension for pytorch*": 1, "intend": 5, "intent": 5, "interact": [7, 34], "interconnect": 33, "interest": 5, "interfac": [5, 6, 18, 26, 28], "intern": [17, 18, 20, 32], "interpret": 31, "interrupt": 32, "intervent": 8, "intra": 2, "intrins": 17, "introduc": [1, 3, 7, 15, 18, 21, 22, 31, 33, 34], "introduct": [0, 2, 7, 28, 33, 34], "invalid": 33, "invers": 8, "investig": [2, 31], "invoc": [1, 7], "invok": [2, 6, 8, 10, 13, 20, 23, 26, 29, 34], "involv": 21, "io": 28, "iostream": 6, "ip": 31, "ipex": [1, 2, 3, 4, 6, 7, 9, 11, 12, 13, 15, 16, 17, 19, 20, 23, 26, 29, 31, 32, 34], "ipex_declare_dispatch": 17, "ipex_define_dispatch": 17, "ipex_en": 32, "ipex_fus": 2, "ipex_register_dispatch": 17, "ipexconfig": 6, "ipexrun": [4, 10, 31, 34], "is_caus": 2, "is_contigu": 18, "is_cus": 2, "is_dynam": [6, 15], "is_hyperthreading_en": 14, "is_runtime_ext_en": 2, "isa": [1, 34], "isa_codegen": 17, "isa_nam": 17, "isacodegen": 17, "issu": [1, 2, 5, 8, 21, 26, 33], "ital": 32, "item": 16, "iter": [2, 16, 21, 28, 34], "its": [2, 6, 7, 8, 14, 17, 21, 28, 30, 31, 32, 33, 34], "itself": [2, 5, 18], "ivalu": 6, "j": [2, 5, 17, 28, 30], 
"jan": 3, "je": 14, "jemalloc": [30, 32, 34], "jemallocl": 31, "jit": [1, 2, 5, 6, 7, 8, 13, 15, 16, 18, 20, 23, 26, 32, 34], "job": 5, "join": 33, "joint": 34, "joint_net": [26, 34], "json": [2, 6, 15, 16, 32, 34], "jul": 3, "jun": 3, "jupyt": 5, "just": [2, 14, 29, 34], "k": [2, 5], "kcpu": 17, "keep": [5, 12, 18, 21, 28, 32, 33, 34], "kei": [2, 7, 28, 34], "kept": 21, "kernel": [1, 2, 7, 20, 26, 28, 30, 33, 34], "kernel_s": 10, "key_cach": 2, "key_token": 2, "keystrok": 5, "keytensor": 2, "keyword": 2, "kill": 32, "kind": 7, "kineto_librari": 6, "kl_div": 8, "kmp": [31, 33], "kmp_": 20, "kmp_affin": [31, 32, 33], "kmp_blocktim": [31, 32, 33], "knob": [2, 4, 12, 31], "know": 5, "knowledg": 33, "known": [6, 10, 28], "kt": 3, "kv": 2, "kv_cach": [2, 28], "kwarg": [2, 29], "l1318": 2, "l1_loss": 8, "l2": 33, "l23": 2, "l4": 2, "l50": 2, "l76": 2, "label": 8, "lake": [7, 30, 34], "lamb": [19, 21], "land": [7, 34], "landscap": [1, 7, 28], "languag": [1, 2, 23, 24, 25, 26, 29, 34], "lar": 34, "larg": [1, 2, 19, 23, 24, 25, 26, 29, 30, 33, 34], "larger": [2, 20, 30, 31, 33, 34], "last": [3, 10, 21, 26, 34], "last_ind": 6, "latenc": [3, 14, 18, 28, 30, 32, 34], "later": [2, 7, 25, 33], "latest": [1, 2, 25, 28, 30, 34], "launch": [4, 6, 20, 32, 34], "launcher": [7, 13, 31, 33, 34], "law": 7, "layer": [2, 16, 20, 22, 28, 34], "layer_past": 2, "layernorm": [2, 13, 16, 22, 34], "layernorm_modul": 2, "layout": [2, 26, 34], "lazi": 5, "ld": 31, "ld_preload": [20, 31, 32, 33], "ldd": 6, "lead": 28, "leaki": 13, "leaky_relu": 13, "leakyrelu": 34, "learn": [3, 7, 8, 11, 13, 14, 21, 31, 33], "learning_r": [10, 34], "leav": [2, 20, 33], "left": [21, 28, 32], "legal": 34, "legend": 28, "len": [2, 6, 7, 13, 16, 17], "length": [2, 5, 14, 21, 26, 30, 34], "less": [2, 8, 18, 20, 26, 34], "let": [5, 10, 18, 19, 20, 21], "level": [7, 10, 13, 16, 18, 20, 21, 26, 33, 34], "leverag": [1, 7, 11, 28, 32, 34], "lib": [6, 31, 32], "lib64": [31, 32], "libc10": 6, "libdnnl_graph": 6, "libgomp": 
33, "libintel": [6, 34], "libiomp": 33, "libiomp5": [20, 31, 32, 33], "libjemalloc": 31, "libpytorch_path": 6, "librari": [1, 2, 5, 6, 7, 17, 20, 32, 33, 34], "libtcmalloc": [31, 32], "libtorch": [6, 34], "libtorch_cpu": 6, "libxsmm": 2, "licens": 17, "lighter": 8, "like": [1, 2, 3, 5, 6, 7, 8, 14, 18, 19, 21, 26, 28, 31, 33, 34], "limit": [5, 8, 10, 20, 26, 32, 33, 34], "linalg_choleski": 8, "linalg_cholesky_ex": 8, "linalg_cond": 8, "linalg_eig": 8, "linalg_eigh": 8, "linalg_eigv": 8, "linalg_eigvalsh": 8, "linalg_householder_product": 8, "linalg_inv": 8, "linalg_inv_ex": 8, "linalg_lstsq": 8, "linalg_matrix_rank": 8, "linalg_qr": 8, "linalg_solv": 8, "linalg_svd": 8, "linalg_svdv": 8, "linalg_tensorinv": 8, "linalg_tensorsolv": 8, "line": [5, 10, 13, 18, 31, 32, 33], "linear": [2, 6, 7, 8, 13, 15, 16, 18, 26, 33, 34], "linear2silumul": [2, 34], "linear_": 2, "linear_bn": 2, "linear_bn_fold": 2, "linear_m": 2, "linear_m_modul": 2, "linear_modul": 2, "linear_relu_stack": 16, "linear_s_modul": 2, "linearadd": [2, 34], "linearaddadd": [2, 34], "lineargelu": [2, 34], "linearize_indices_and_offset": 7, "linearmul": [2, 34], "linearnewgelu": [2, 34], "linearrelu": [2, 34], "linearsilu": [2, 34], "linearsilumul": [2, 34], "link": [1, 6, 17, 34], "linux": [5, 6, 17, 30, 31, 33], "list": [2, 5, 7, 8, 13, 14, 16, 18, 25, 29, 31, 32, 33, 34], "liuhaotian": 28, "live": 5, "ll": [5, 32, 33], "llama": [2, 3, 6, 28, 34], "llama2": [30, 34], "llama3": 34, "llava": [2, 28], "llm": [1, 16, 22, 24, 25, 34], "load": [1, 2, 6, 7, 13, 15, 16, 17, 23, 29, 32, 34], "load_dataset": 6, "load_qconf_summari": 15, "load_state_dict": [2, 34], "loader": 16, "local": [6, 20, 28, 31, 32, 33], "locat": [5, 17, 34], "log": [4, 6, 13, 31, 32, 34], "logic": [2, 14, 18, 32, 33], "login": 6, "logit": 16, "long": [2, 6, 18, 21, 26, 28, 34], "long_factor": 2, "longer": [26, 30, 34], "longform": 26, "look": [5, 14, 16, 18], "loop": [5, 21, 29], "lose": 21, "loss": [2, 5, 6, 8, 16, 18, 21, 26], "loss_fn": 
16, "lot": [28, 34], "low": [3, 4, 6, 7, 21, 23, 31, 33, 34], "low_cpu_mem_usag": [6, 23], "low_precision_checkpoint": [2, 6, 29], "lower": [2, 8, 17, 21, 28, 34], "lowest": 2, "lowp": [2, 6], "lowp_mod": [2, 6, 29], "lr": [6, 7, 8, 16, 19], "lr_decai": 19, "lsb": 17, "lscpu": 33, "lstm": [2, 10, 15, 34], "lstmcell": 15, "lstsq": 8, "lt": [4, 28, 30], "lu_solv": 8, "m": [4, 14, 20, 26, 31, 32, 33, 34], "m6i": [30, 32], "m7i": 30, "machin": [3, 5, 6, 7, 14, 17, 26, 31, 32, 33, 34], "maco": 5, "macro": 17, "made": [5, 34], "mai": [1, 2, 3, 5, 6, 7, 8, 9, 16, 17, 18, 20, 26, 28, 31, 32, 33, 34], "main": [1, 2, 5, 6, 14, 20, 31, 32], "mainli": [31, 34], "maintain": 8, "major": 16, "make": [2, 5, 6, 7, 14, 15, 17, 21, 23, 28, 32, 33], "make_tupl": 17, "makefil": 5, "malloc": [14, 31, 33], "malloc_conf": [31, 33], "mamx": 17, "man": [7, 33], "manag": [2, 8, 13, 20, 28, 31], "mandatori": 14, "mani": [5, 14, 28, 31, 33, 34], "manipul": 18, "mantissa": 21, "manual": [2, 7, 10, 14, 18, 20, 34], "manual_se": [6, 11], "map": [2, 6, 18, 30], "mar": [3, 32], "margin_ranking_loss": 8, "mask": [2, 7, 17, 26], "mask_valu": 17, "maskrcnn": [33, 34], "maskrnn": 34, "master": [2, 7, 21, 31], "master_addr": 6, "master_port": 6, "match": [2, 8, 17, 31], "math": 7, "matmul": [2, 8, 13, 26, 34], "matrix": [1, 6, 7, 25, 28], "matrix_rank": 8, "matur": 34, "mavx2": 17, "mavx512bf16": 17, "mavx512bw": 17, "mavx512dq": 17, "mavx512f": 17, "mavx512fp16": 17, "mavx512vl": 17, "mavx512vnni": 17, "max": [2, 6, 16, 17, 22, 23, 26, 34], "max_context_len": 2, "max_new_token": [6, 23], "max_num_blocks_per_seq": 2, "max_position_embed": 2, "max_seq": 2, "max_seq_len": 30, "max_seq_length": [10, 34], "max_seqlen_k": 2, "max_seqlen_kv": 2, "max_seqlen_q": 2, "max_trial": 14, "max_unpool2d": 8, "max_unpool3d": 8, "maxim": 14, "maximum": [2, 16, 17], "maxpool": 34, "maxpool2d": 13, "maycontainalia": 5, "md": 18, "me": 18, "mean": [2, 16, 17, 18, 20, 22, 28, 34], "meant": 34, "meanwhil": [12, 33, 34], 
"measur": [30, 34], "mechan": [1, 7, 17, 21, 34], "medium": 28, "meet": [21, 33, 34], "meltdown": 30, "membind": 33, "memori": [2, 6, 7, 8, 9, 10, 13, 19, 20, 21, 26, 28, 30, 32, 34], "memory_format": [6, 7, 18, 23], "mention": [3, 10, 20, 21, 34], "merg": [0, 7, 34], "merged_emb": 7, "merged_input": 7, "mergedembeddingbag": 7, "mergedembeddingbagwith": 7, "mergedembeddingbagwithsgd": 7, "merit": 18, "mermori": 2, "messag": [2, 6, 10, 12, 18, 31], "meta": [6, 18, 28, 29, 34], "metadata_thp": [31, 33], "method": [2, 8, 15, 16, 18, 22, 26, 33, 34], "method1": 10, "method2": 10, "methodologi": [2, 6, 7, 19, 33], "methond": 15, "metric": [2, 16, 30], "mfma": 17, "mha": [2, 34], "mhz": 33, "microarchitectur": 33, "microsoft": [2, 28], "might": [2, 7, 18, 26, 33, 34], "migrat": 7, "millisecond": 33, "min": [2, 16, 22, 26, 34], "mind": [18, 32], "mini": [2, 20, 28, 34], "minim": [7, 14, 17, 33], "minimum": [14, 16, 18], "minmax": 34, "minmaxobserv": [2, 6, 15], "misc": 34, "mish": 13, "miss": 5, "mistral": [2, 28, 34], "mistralai": 28, "mitig": [20, 30], "mix": [2, 6, 13, 23, 26, 28, 34], "mixed_dtyp": 34, "mixtral": [2, 28], "mixtur": [8, 34], "mkdir": 6, "mkl": 34, "mkldnn": 18, "mkldnn_util": 18, "mllama": 2, "mlp": 34, "mm": 8, "mmuzzy_decay_m": 33, "mmx": 17, "mno": 17, "mobilenet": 30, "mode": [1, 2, 5, 7, 10, 12, 18, 20, 23, 26, 32, 34], "model": [1, 2, 3, 4, 8, 9, 10, 11, 12, 14, 16, 23, 24, 25, 26, 29, 30, 33, 34], "model1": 20, "model2": 20, "model_execut": 34, "model_id": [6, 23], "model_log": 32, "model_name_or_path": [10, 29, 34], "model_script": 20, "model_service_work": 32, "model_state_dict": 6, "model_stor": 32, "model_to_be_calibr": 34, "modelfamili": 28, "modeling_llama": 2, "modelurl": 32, "modern": 3, "modifi": [2, 5, 6], "modul": [1, 6, 7, 8, 13, 16, 17, 26, 29, 31, 34], "modular": 2, "modulist": 7, "momentum": [6, 10, 21], "monkei": 10, "more": [1, 2, 5, 6, 7, 8, 10, 11, 13, 16, 17, 19, 20, 21, 23, 26, 28, 32, 33, 34], "moreov": [1, 2, 28], 
"mosaicml": 28, "most": [2, 6, 7, 13, 21, 28, 30, 32, 33, 34], "motherboard": 33, "motiv": [2, 20], "move": [18, 33], "movingaverageminmax": 34, "mp_size": 29, "mpi": 31, "mpiexec": 31, "mpt": [2, 28, 34], "mrpc": 30, "mse_loss": 8, "much": [15, 18, 21, 28, 31, 33], "mul": [2, 13, 16], "multi": [2, 7, 14, 20, 28, 31, 33, 34], "multi_margin_loss": 8, "multi_stream": 2, "multi_stream_input_hint": 34, "multi_stream_model": [20, 34], "multi_stream_output_hint": 34, "multidimension": 18, "multiheadattent": 28, "multilabel_margin_loss": 8, "multilabel_margin_loss_forward": 8, "multipl": [2, 5, 7, 8, 16, 17, 18, 26, 28, 30, 32, 33, 34], "multipli": 2, "multistreammodul": [2, 7, 20, 26, 34], "multistreammodulehint": [2, 20, 34], "multithread": 33, "must": [2, 5, 14, 17, 19], "mutual": 31, "muzzy_decay_m": [31, 33], "my": 18, "mykernel": 17, "mymodel": 34, "mypi": 5, "n": [2, 6, 7, 16, 18, 19, 20, 26, 32, 33, 34], "n1": 18, "n2": 18, "n_iter": 32, "name": [2, 5, 7, 14, 17, 25, 28, 31, 32, 33, 34], "namespac": [8, 17], "nan": [17, 34], "nanquantil": 8, "narg": 6, "narrow": 5, "nativ": [1, 6, 7, 8, 17, 19, 21, 26, 28, 34], "natur": [18, 21, 28], "naver": 3, "nb": 18, "nc": 32, "nchw": [7, 33], "ncore": [10, 31], "ncore_per_inst": [14, 34], "ncores_per_inst": 14, "nd": 18, "necessari": 18, "necessarili": 2, "neck": 19, "need": [2, 5, 6, 7, 10, 13, 14, 16, 17, 18, 19, 20, 21, 23, 26, 29, 31, 32, 33, 34], "need_linearize_indices_and_offset": 7, "neelnanda": 6, "neg": 21, "neglig": 18, "neighbor": 2, "neox": [2, 28], "net": 34, "network": [1, 3, 7, 8, 20, 25, 28, 33], "neural": [1, 3, 7, 16, 22, 25, 28, 33, 34], "neuralnetwork": 16, "new": [3, 5, 12, 16, 17, 18, 20, 23, 26, 29, 33], "new_gelu": 2, "new_layer_past": 2, "newer": [1, 28, 33], "newgeluactiv": 2, "newkernel": 17, "newkernelkrnl": 17, "newli": 34, "newlin": 5, "next": [5, 7, 34], "nf4": [2, 29], "nhwc": [7, 33, 34], "nifti": 33, "ninstanc": [10, 14, 31, 34], "nint": 5, "nll_loss": 8, "nll_loss2d": 8, "nlp": [6, 7, 26, 
30, 34], "nm": [7, 34], "nn": [2, 6, 7, 8, 10, 13, 15, 16, 18, 20, 26, 34], "nnc": 26, "nnode": 31, "no_grad": [4, 6, 10, 11, 12, 13, 15, 16, 20, 23, 26, 29, 32, 34], "node": [2, 20, 30, 32, 33, 34], "node0": 33, "node1": 33, "node_id": [2, 20, 31, 32, 34], "non": [2, 5, 8, 13, 18, 30, 32, 34], "noncontigu": 18, "none": [2, 6, 29, 31], "noqa": [6, 11, 12, 13, 16, 23, 29], "normal": [1, 2, 6, 7, 13, 20, 28, 33, 34], "normalized_shap": 2, "note": [2, 3, 5, 6, 15, 16, 17, 18, 20, 22, 24, 28, 30, 31, 32, 33], "notfound": 6, "noth": 2, "notic": [27, 31, 32], "nov": 3, "now": [2, 7, 15, 18, 32, 33, 34], "np": [16, 31], "nproc": 31, "nth": [32, 33], "num": [2, 20, 32, 33, 34], "num_attention_head": 6, "num_beam": [6, 23], "num_block": 2, "num_featur": 7, "num_head": 2, "num_hidden_lay": 6, "num_kv_head": 2, "num_nod": 14, "num_seq": 2, "num_stream": [2, 20, 34], "num_token": 2, "num_train_epoch": [10, 34], "numa": [2, 20, 31, 32, 34], "numactl": [20, 31, 32], "number": [1, 2, 5, 6, 7, 14, 16, 19, 20, 21, 26, 32, 34], "numer": [2, 8, 33], "numpi": 16, "o": [6, 17, 23, 30], "o0": [2, 26, 34], "o1": [2, 26, 34], "o3": 17, "object": [2, 6, 7, 14, 17, 20, 33, 34], "observ": [2, 9, 13, 15, 34], "obsev": 15, "obtain": 16, "obviou": 28, "occupi": 26, "occur": 34, "occurr": 28, "off": [7, 8, 21, 28, 30, 34], "offer": [1, 5, 33], "offici": [5, 32, 33, 34], "offlin": 34, "offset": [2, 18, 28], "often": 7, "old": 34, "omp": [20, 26, 31, 32, 33, 34], "omp_num_threa": 26, "omp_num_thread": [20, 26, 31, 32, 34], "omp_proc_bind": [31, 33], "omp_schedul": [31, 33], "omp_set_num_thread": 34, "onboard": [19, 33], "onc": [2, 5, 6, 14, 17, 18, 20, 21, 32, 33], "ondevic": 29, "one": [2, 5, 7, 12, 13, 14, 16, 18, 19, 20, 26, 29, 31, 33, 34], "oneapi": [6, 33], "oneccl": [3, 6, 31, 34], "oneccl_bindings_for_pytorch": 6, "onednn": [2, 3, 13, 17, 26, 28, 34], "onednn_primitive_cache_capac": 33, "onednn_verbos": 4, "ones": [2, 6, 17], "onli": [1, 2, 5, 7, 8, 10, 11, 13, 14, 15, 16, 17, 18, 20, 21, 
26, 28, 31, 32, 34], "onlyquantizationint4": 28, "onlyquantizationint8": 28, "oob": [10, 34], "op": [2, 7, 15, 16, 22, 28, 34], "op_type_dict": 2, "open": [1, 16, 28, 33], "openai": 28, "openmp": [2, 7, 20, 26, 30, 32, 34], "oper": [1, 2, 6, 8, 13, 15, 21, 32, 33, 34], "opportunit": 2, "opt": [2, 6, 17, 28], "optdecoderlay": 16, "optim": [1, 3, 4, 6, 8, 9, 11, 12, 14, 16, 18, 20, 21, 23, 25, 26, 31, 32, 33, 34], "optimize_lstm": 2, "optimize_transform": 34, "optimized_model": [2, 34], "optimized_optim": 2, "optimizer_state_dict": 6, "optimum": 10, "optin": 2, "option": [1, 2, 5, 7, 10, 14, 15, 16, 29, 31, 34], "optyp": 2, "order": [2, 17, 18, 21, 31, 33, 34], "org": [2, 7, 16, 26, 34], "organ": 18, "orgqr": 8, "origin": [2, 6, 7, 12, 13, 15, 17, 20, 29, 34], "original_max_position_embed": 2, "original_model": 2, "ormqr": 8, "other": [2, 6, 7, 8, 14, 17, 18, 19, 23, 28, 31, 33], "other_1": 2, "other_2": 2, "other_arg": 19, "otheriws": 13, "otherwis": [2, 7, 20], "our": [5, 16, 19, 28, 33, 34], "out": [2, 5, 6, 7, 8, 10, 13, 16, 19, 20, 30, 31, 33, 34], "outlier": [7, 16], "outplac": [18, 34], "output": [2, 6, 7, 8, 13, 14, 16, 18, 23, 26, 34], "output_concat_hint": [2, 20], "output_dir": [10, 14, 34], "output_hint": 20, "output_tokens_length": [6, 23], "outsid": 20, "outstand": 5, "over": [5, 7, 8, 9, 16, 18, 30, 31, 34], "overal": 33, "overflow": [26, 34], "overhead": [1, 2, 7, 10, 19, 20, 26, 28, 33, 34], "overlap": 32, "overrid": 15, "overridden": [2, 17], "oversize_threshold": [31, 33], "overview": [7, 25, 34], "overwrit": [2, 31], "own": [2, 6, 15, 28], "owner": 13, "p29": 30, "p90": 30, "pack": [2, 20, 34], "packag": [1, 2, 5, 6, 7, 10, 23, 25, 26, 32, 33, 34], "pad": [8, 10, 20, 34], "pad_max": 6, "pad_val": 6, "padding_mod": 34, "page": [2, 6, 13, 20, 24, 29, 30, 33, 34], "pagedattent": [2, 34], "paper": [2, 34], "parallel": [2, 5, 6, 7, 28, 33, 34], "param": [2, 19, 31], "param_i": 19, "param_n": 19, "paramet": [2, 6, 7, 8, 10, 16, 17, 19, 20, 21, 26, 28, 
29, 30, 31, 33, 34], "parse_arg": [6, 23], "parser": [6, 23], "part": [3, 5, 7, 8, 18, 21, 26, 31, 33, 34], "parti": 34, "partial": 7, "particular": [5, 6, 8, 29, 34], "partit": [13, 33], "pass": [1, 2, 5, 10, 17, 20, 26, 32, 34], "past": 28, "past_key_valu": [2, 6], "past_kv_length": 2, "patch": [10, 34], "path": [2, 6, 7, 14, 18, 20, 23, 31, 33, 34], "pattern": [7, 11, 18, 28, 34], "pdf": 2, "pdropout": 2, "peak": [2, 7, 11, 34], "penal": 33, "pend": 34, "per": [2, 10, 15, 16, 20, 30, 31, 32, 33, 34], "per_batch": 2, "per_batch_ic_block": 2, "per_batch_ic_block_sym": 2, "per_channel_symmetr": [2, 6, 15], "per_device_train_batch_s": [10, 34], "per_ic_block": 2, "per_tensor": 2, "per_tensor_affin": [6, 15, 34], "per_tensor_symmetr": 15, "perchannelminmaxobserv": [2, 6, 15], "perf": [11, 18], "perfect": 28, "perform": [1, 2, 3, 4, 6, 7, 8, 9, 10, 13, 14, 15, 16, 18, 19, 21, 25, 28, 29, 31], "period": 33, "person": 3, "perspect": [2, 13, 18, 21, 28, 31, 33], "pertain": 17, "phase": [2, 20], "phi": [2, 28, 34], "physic": [2, 14, 20, 32, 33], "pick": 5, "piec": [2, 20], "pile": 6, "pin": [2, 20], "pinvers": 8, "pip": [4, 5, 33, 34], "pip3": 34, "place": [2, 8, 28, 33, 34], "placeholderobserv": [6, 15], "placement": 33, "plai": [7, 33], "plan": [5, 7, 10], "platform": [3, 7, 18, 32, 33, 34], "platinum": [14, 30, 32, 33], "pleas": [2, 6, 7, 11, 16, 22, 26, 28, 29, 31, 33, 34], "plu": 33, "pmi_rank": 6, "pmi_siz": [6, 29], "point": [2, 6, 8, 15, 21, 33, 34], "pointer": 17, "poisson_nll_loss": 8, "polar": 8, "polici": 33, "polish": 34, "polymorph": 17, "pool": [2, 20, 34], "poor": [26, 34], "popular": [1, 7, 22, 28, 30, 34], "popup": 5, "port": 31, "portabl": 11, "portion": 16, "pos_embd_dim": 2, "posit": [2, 28, 33, 34], "position_id": [2, 6], "position_ids_pad": 6, "possibl": [2, 14, 15, 19, 28, 31, 33, 34], "post": [2, 4, 5, 7, 15, 28, 34], "potenti": [3, 7, 34], "pow": 13, "power": [2, 7, 33, 34], "ppn": 31, "pr": [7, 18, 34], "practic": [6, 21, 24, 28, 33], "pragma": 
17, "pre": [2, 28, 34], "precis": [2, 4, 6, 13, 21, 23, 26, 30, 34], "pred": 16, "predefin": 2, "predict": 16, "prefer": [1, 7, 8, 15, 24], "prefetchw": 17, "prefetchwt1": 17, "prefil": 2, "prefix": 31, "preload": [2, 31], "prepack": [2, 6, 10, 18, 26, 34], "prepar": [2, 4, 6, 13, 16, 26, 29, 32, 34], "prepared_model": [2, 4, 6, 13, 15, 16, 26, 29, 34], "prerequisit": [5, 6], "present": 32, "pretrain": [6, 32, 34], "pretti": 33, "prevent": 19, "previou": [14, 16, 18, 33, 34], "previous": 32, "primari": 33, "primarili": [8, 34], "primit": [11, 20, 30, 34], "principl": [3, 18], "print": [6, 11, 12, 13, 14, 16, 17, 23, 31], "printf": 5, "prior": [2, 23], "privat": 34, "probabl": 2, "problem": [7, 19, 26, 32, 33], "proc": 31, "procedur": 32, "process": [2, 6, 7, 11, 12, 14, 16, 19, 20, 21, 26, 31, 32, 33], "processor": [3, 7, 19, 21, 28, 30, 33, 34], "proclist": 33, "prod": 8, "produc": [5, 8], "product": [1, 2, 7, 14, 28, 34], "program": [1, 5, 7, 11, 20, 31, 33, 34], "progress": [26, 28, 34], "project": [1, 6], "prompt": [4, 6, 23, 34], "propag": [13, 21, 33], "proper": 34, "properli": 31, "properti": [6, 32], "propos": [5, 7, 11, 16, 18, 21], "prototyp": [4, 13, 20, 26, 34], "provid": [1, 2, 5, 6, 7, 8, 11, 12, 13, 14, 16, 20, 22, 24, 26, 28, 29, 31, 32, 33, 34], "pseudo": [19, 21, 34], "pseudocod": [26, 34], "pt": [6, 13, 14, 15, 23, 32, 34], "pth": 6, "pthread": 20, "ptmalloc": 32, "ptq": 7, "public": 34, "pull": 5, "purlei": 33, "purpos": [17, 31, 32, 33], "push": 34, "push_back": 6, "put": 33, "py": [2, 5, 10, 14, 20, 31, 32, 34], "pyg": 3, "pyi": 5, "pypi": [26, 34], "python": [1, 2, 4, 10, 14, 17, 20, 26, 28, 29, 31, 32, 33, 34], "pytorch": [2, 3, 4, 6, 7, 8, 9, 10, 13, 14, 16, 17, 20, 23, 25, 26, 27, 28, 29, 30, 31, 33, 34], "q": [2, 28], "qa": [10, 34], "qconf_summari": [6, 15, 16, 29], "qconfig": [2, 4, 6, 13, 16, 26, 29, 32, 34], "qconfig_map": 6, "qconfig_summary_fil": [2, 6, 29], "qconfig_summary_file_path": 29, "qconfigmap": 6, "qint8": [2, 6, 15], 
"qkv": 34, "qparam": 15, "qr": 8, "qscheme": [2, 6, 15, 34], "qualiti": 34, "quant": [2, 16], "quant_method": 2, "quant_stat": 15, "quantconf": 34, "quantil": 8, "quantiz": [1, 3, 4, 13, 22, 26, 28, 30, 32, 34], "quantizat": 2, "quantization_config": [2, 6, 29], "quantize_per_tensor": 26, "quantized_model": [13, 15, 34], "queri": [2, 17, 18], "query_roteri": 2, "query_token": 2, "question": [18, 30], "quick": [1, 20, 24, 25], "quick_check": 5, "quickli": 2, "quicklint": 5, "quickstart_tutori": 16, "quint8": [6, 15], "quit": [17, 34], "qwen": [2, 28, 34], "qwen2": [28, 34], "r": [5, 6, 7, 14, 23, 30, 32, 33], "rais": [2, 10], "rand": [6, 8, 12, 13, 20, 26, 34], "randint": [6, 11, 32], "randn": [2, 10, 13, 16, 18, 32, 34], "random": 14, "rang": [1, 6, 7, 15, 16, 19, 21, 26, 31, 32, 34], "rank": [6, 31, 34], "rapid": 3, "rate": 21, "rather": [2, 18], "ratio": [22, 30, 34], "raw": 2, "rc": 34, "rc3": 34, "re": [5, 8, 32, 33, 34], "reach": 34, "read": [7, 19], "readm": 34, "real": [2, 7, 14, 15, 30, 34], "realli": 5, "realtim": 30, "reason": [2, 10, 18, 20, 34], "rebas": [5, 34], "receip": [16, 20], "receipt": 20, "receiv": 21, "recent": [6, 7, 18], "recip": [2, 4, 7, 13, 15, 26, 28, 34], "recognit": 33, "recommend": [1, 5, 6, 7, 9, 10, 15, 16, 20, 23, 30, 31, 33, 34], "record": [14, 32], "recov": 21, "recurs": 5, "reduc": [1, 2, 7, 15, 19, 20, 21, 22, 26, 28, 33, 34], "reduce_rang": 15, "reduct": 34, "refer": [1, 7, 9, 13, 14, 16, 17, 18, 20, 22, 23, 24, 25, 32, 34], "refin": 34, "reflection_pad1d": 8, "reflection_pad2d": 8, "regard": 13, "regardless": [8, 34], "region": [2, 8, 17, 33], "regist": [1, 7, 10, 16, 17, 34], "registr": 7, "regress": [9, 34], "regular": [6, 21], "reinstal": [5, 26], "reinterpret": 18, "reinterpret_cast": 17, "rel": [2, 4, 16, 31, 34], "relat": [2, 6, 13, 17, 31, 33, 34], "releas": [1, 17, 18, 26, 30, 33], "reli": [18, 20], "relu": [2, 7, 13, 16, 18, 26, 34], "relu6": 34, "remain": 32, "remaind": [2, 20], "remark": [26, 30, 33], "remot": 33, 
"remov": [2, 5, 21, 34], "reorder": [2, 18, 28], "reorder_cach": 28, "repeat": [10, 18, 21], "repeatedli": 5, "replac": [2, 5, 7, 10, 26, 34], "replace_dropout_with_ident": 2, "replication_pad1d": 8, "replication_pad2d": 8, "replication_pad3d": 8, "repo": [5, 6, 7], "repo_root": 29, "report": [1, 17], "repres": [5, 7, 21], "represent": 18, "reproduc": 32, "request": [1, 5, 20, 32], "requir": [2, 5, 6, 8, 10, 16, 18, 21, 26, 28, 29, 31, 32, 34], "research": 28, "reserv": 33, "reshape_and_cach": 2, "residu": 31, "resiz": [6, 13], "resnet18": 34, "resnet18_xpu": 34, "resnet34": [30, 34], "resnet3d": 34, "resnet50": [12, 13, 14, 18, 30, 31, 33, 34], "resnet50_weight": [6, 12, 13], "resnext": 30, "resnext101": [18, 34], "resnext3d": 34, "resolv": 34, "resourc": [13, 20, 28, 32, 33], "respect": [14, 16, 30, 31, 34], "respons": 30, "rest": 32, "restart": 32, "result": [1, 2, 6, 10, 12, 14, 16, 18, 20, 21, 30, 31, 32, 33], "retinanet": 34, "retriev": 33, "return": [2, 6, 7, 8, 10, 16, 17, 20, 26, 34], "return_softmax": 2, "return_tensor": [6, 23], "reus": [2, 33], "review": [7, 34], "rf": 5, "rfc": 18, "rh": 17, "right": [7, 21, 23, 28], "risk": 34, "rm": 5, "rms_norm": [2, 34], "rmsnorm": [2, 28, 34], "rmsnorm_modul": 2, "rn50": [13, 34], "rn50_int8_jit": 32, "rn50_ipex_int8": 32, "rnn": 34, "rnncell": 15, "rnnt": [26, 34], "ro": 2, "roberta": [26, 34], "roialign": [7, 34], "role": 33, "root": [6, 13, 16, 17, 28], "rope": [28, 34], "rope_modul": 2, "rotari": [2, 28], "rotary_dim": 2, "rotary_embed": [2, 34], "rotary_half": 2, "rotary_ndim": 2, "rotaryembed": [2, 34], "roughli": 18, "round": [13, 21], "rounding_bia": 17, "row": 7, "rst": 5, "rule": [21, 34], "run": [2, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 26, 30, 31, 32, 33, 34], "run_20210712212258_inst": 31, "run_20210712212258_instance_0_cores_0": 31, "run_20210712214504_inst": 31, "run_20210712214504_instance_0_cores_22": 31, "run_20210712220928_inst": 31, "run_20210712220928_instance_0_cores_0": 31, 
"run_20210712221150_inst": 31, "run_20210712221150_instance_0_cores_0": 31, "run_20210712221150_instance_1_cores_22": 31, "run_20210712221305_inst": 31, "run_20210712221305_instance_0_cores_0": 31, "run_20210712221305_instance_1_cores_11": 31, "run_20210712221305_instance_2_cores_22": 31, "run_20210712221305_instance_3_cores_33": 31, "run_20210712221415_inst": 31, "run_20210712221415_instance_0_cores_0": 31, "run_20210712221415_instance_10_cores_40": 31, "run_20210712221415_instance_1_cores_4": 31, "run_20210712221415_instance_2_cores_8": 31, "run_20210712221415_instance_3_cores_12": 31, "run_20210712221415_instance_4_cores_16": 31, "run_20210712221415_instance_5_cores_20": 31, "run_20210712221415_instance_6_cores_24": 31, "run_20210712221415_instance_7_cores_28": 31, "run_20210712221415_instance_8_cores_32": 31, "run_20210712221415_instance_9_cores_36": 31, "run_20210712221615_inst": 31, "run_20210712221615_instance_0_cores_11": 31, "run_20210712223308_inst": 31, "run_20210712223308_instance_0_cores_0": 31, "run_20210713152500_instance_0_cores_0": 31, "run_20210713153048_instance_0_cores_0": 31, "run_20210713153333_instance_0_cores_0": 31, "run_20210713153659_instance_0_cores_0": 31, "run_20220106130151_instance_0_cores_0": 31, "run_benchmark": [26, 34], "run_qa": [10, 34], "runner": 5, "running_mod": 34, "runtim": [1, 8, 13, 17, 31, 33, 34], "runtimeerror": [26, 34], "s1": 20, "s7": 34, "s8": 34, "sacrif": 8, "sai": 5, "salesforc": 28, "same": [2, 5, 7, 10, 15, 16, 17, 18, 20, 21, 28, 31, 32, 33, 34], "same_model_execution_again": 34, "sampl": [2, 6, 9, 14, 16, 17, 29, 33], "sample_input": [2, 9, 34], "sample_text_captum_input": 32, "sampler": 6, "sampling_s": [2, 4, 16, 34], "sapphir": 3, "satisfi": [15, 26], "satur": 34, "save": [2, 5, 6, 7, 13, 14, 15, 16, 18, 21, 28, 32, 34], "save_qconf_summari": [6, 15, 16, 29], "scalabl": [3, 7, 21, 28, 30, 33, 34], "scalar": 2, "scalartyp": 17, "scalartypetocpptyp": 17, "scale": [2, 3, 6, 15, 28], "scale_attn": 2, 
"scaled_dot_product_attent": 2, "scatter": 31, "scenario": [2, 6, 7, 18, 33, 34], "schedul": [1, 2, 13, 20, 31, 33], "scheme": 32, "scope": [2, 7, 8, 21, 34], "script": [1, 2, 3, 4, 5, 6, 7, 8, 10, 14, 17, 20, 23, 24, 26, 28, 29, 30, 32, 33, 34], "scriptmodul": [2, 13, 20], "sdk": 34, "search": [1, 2, 4, 5, 7, 16, 22, 28, 31], "sec": 30, "second": [2, 10, 28, 32, 33], "secondli": 28, "secret": 18, "section": [1, 6, 7, 8, 14, 20, 23, 24, 25, 28, 29, 32, 33, 34], "secur": 3, "see": [1, 2, 5, 8, 14, 34], "seed": 2, "seen": 28, "select": [2, 5, 7, 13, 24, 34], "self": [2, 6, 8, 10, 16, 20, 26, 34], "selu": 34, "semant": 18, "sens": 21, "sep": [3, 17], "separ": [7, 19, 27, 33], "seq_classification_artifact": 32, "seq_info": 2, "seq_len": [2, 30], "seq_length": [6, 11, 32], "seqlen_k": 2, "seqlen_q": 2, "sequenc": [2, 18, 21, 28, 34], "sequenti": 16, "seri": 33, "serv": [20, 34], "server": [32, 33], "servic": [6, 28, 30, 33], "session": 30, "set": [1, 2, 4, 5, 6, 7, 8, 14, 15, 16, 17, 21, 24, 26, 28, 30, 31, 32, 33, 34], "set_flush_denorm": 33, "set_format": 6, "set_glob": 6, "set_num_thread": [26, 34], "set_properti": 6, "sete": 15, "settensorexprfuseren": 26, "setup": [5, 6, 28, 34], "setup_config": 32, "setup_lint": 5, "sever": [2, 7, 10, 19, 30, 31, 34], "sgd": [2, 6, 7, 8, 16, 19], "sgemm": 34, "sha": 17, "shall": [5, 18, 33], "shape": [2, 6, 7, 16, 20, 23, 30, 33, 34], "shard": 28, "share": [1, 5, 6, 16, 20, 32, 33, 34], "share_weight_observ": 2, "shared_criterion": [16, 22], "sheet": 23, "shift": 21, "ship": 28, "short_factor": 2, "shortcut": 34, "shorten": 5, "shorter": [21, 28], "should": [2, 5, 8, 15, 20, 28, 31, 33], "show": [8, 17, 21, 28, 29, 30, 31, 32, 33, 34], "shown": [1, 6, 18, 28, 31, 32], "shuffl": 6, "shufflenet": 30, "shufflenetv2_x1": 30, "side": [15, 33], "sigmoid": [13, 34], "sign": 21, "signficantli": 32, "signifi": 28, "signific": 21, "significantli": [28, 34], "silu": [2, 13], "similar": [15, 17, 33], "similarli": 32, "simpl": [5, 7, 8, 11, 
18, 33, 34], "simplenet": [8, 34], "simpli": [6, 7, 26, 31], "simplifi": [10, 34], "simultan": 20, "sin": 2, "sinc": [6, 7, 18, 19, 20, 21, 26, 33, 34], "sincer": 34, "singl": [2, 7, 13, 14, 16, 19, 20, 30, 32, 34], "single_query_cached_kv_attent": 2, "site": 32, "situat": [7, 14], "six": 33, "size": [2, 6, 7, 11, 15, 16, 17, 18, 23, 26, 28, 30, 32, 33, 34], "sizeof": 17, "skip": [5, 6, 17, 18], "skip_special_token": [6, 23], "skylak": 15, "sleef": 17, "sleep": 33, "slice": [6, 18], "sliu": 34, "slope": 2, "slot": [2, 30], "slot_map": 2, "slow": 34, "slower": [8, 33, 34], "small": [7, 19, 33, 34], "smaller": [8, 17], "smooth": 7, "smooth_l1_loss": 8, "smoothquant": [2, 6, 7, 16, 22, 28, 34], "smoothquant_arg": [2, 16], "snippet": [10, 29], "so": [2, 5, 6, 7, 8, 15, 17, 18, 20, 30, 31, 32, 33, 34], "sock": 32, "socket": [14, 30, 32, 33, 34], "soft_margin_loss": 8, "softmax": [2, 13, 34], "softmax_scal": 2, "softwar": [3, 27, 34], "sole": 33, "solut": [2, 7, 26, 28, 34], "solv": [7, 19, 33], "some": [2, 5, 7, 8, 13, 16, 17, 18, 20, 26, 28, 31, 32, 33, 34], "someth": 18, "sometim": [31, 33], "sophist": 33, "sourc": [1, 5, 6, 17, 27, 28, 33, 34], "space": [2, 7, 16, 18, 22, 33], "spars": [7, 18, 34], "sparsiti": 2, "spawn": [7, 20], "special": [17, 18, 28], "specif": [1, 2, 5, 6, 7, 12, 18, 20, 26, 28, 31, 33, 34], "specifi": [2, 5, 6, 14, 20, 31, 33, 34], "specifii": 17, "spectr": 30, "speech": [3, 33], "speed": [2, 7, 11, 19, 28, 33, 34], "speedup": [2, 6, 8, 28, 30, 34], "sphinx": 5, "split": [2, 6, 7, 16, 17, 19, 20, 26, 34], "split_bf16_from_fp32": 21, "split_master_weight_for_bf16": 2, "splitsgd": [7, 21], "spontan": 18, "sqrt": [2, 13, 19], "squad": [10, 30, 34], "squar": [13, 28], "squenc": 2, "src": [2, 17], "src_data_ptr": 18, "src_md": 18, "src_mem": 18, "ssd": [30, 34], "sse": 17, "sse2": 17, "sse3": 17, "sse4_1": 17, "sse4_2": 17, "ssse3": 17, "stabil": [2, 8, 34], "stabilityai": 28, "stabl": [2, 3, 8, 34], "stablelm": [2, 28], "stack": [6, 8], "stage": 
[7, 10, 19, 20, 29, 33, 34], "stakehold": 34, "stall": 33, "standard": [1, 34], "stanford": 34, "starcod": [28, 34], "start": [1, 3, 4, 5, 6, 7, 10, 20, 24, 34], "start_dim": 20, "state": [2, 15, 19, 28], "state_dict": [2, 6, 34], "state_sum": 19, "state_sum_i": 19, "state_sum_n": 19, "statement": [14, 17], "static": [2, 4, 16, 26, 28, 31, 32, 33, 34], "static_quantized_model": 6, "staticquantizationint8": 28, "statist": 7, "statu": 17, "std": [6, 17, 19], "stdio": 5, "stdout": 31, "stead": 17, "steam": [20, 34], "step": [2, 5, 6, 7, 8, 14, 16, 19, 21, 32], "step_siz": [16, 22], "stft": 8, "stick": 7, "still": [2, 5, 7, 8, 13, 16, 18, 21, 26, 34], "stock": [13, 30, 34], "stop": [2, 33], "storag": 19, "store": [2, 17, 18, 19, 21, 28, 31, 32, 33, 34], "store_tru": [6, 23], "str": [2, 6, 14, 23, 31], "straight": [13, 33], "straightforward": 34, "strategi": [14, 31, 33, 34], "stream": [2, 7, 20, 34], "streamlin": 34, "strict": [6, 32], "stride": [8, 10, 20, 34], "stride_c": 18, "stride_h": 18, "stride_n": 18, "stride_w": 18, "string": [2, 31], "structur": [1, 18, 31, 34], "style": [2, 5], "sub": [20, 28, 33], "subfold": 17, "subgraph": 2, "subject": [7, 17, 20, 27, 34], "submit": [1, 5, 7, 20], "submodul": 5, "subsequ": [18, 33], "substr": 5, "success": [10, 24], "suffer": 20, "suffix": 17, "suggest": [1, 2, 15, 18, 20, 33, 34], "suit": 5, "sum": [13, 16, 18, 19, 34], "summar": 26, "summari": [6, 34], "super": [8, 10, 16, 20, 26, 34], "superset": 20, "suppli": 8, "support": [2, 5, 6, 7, 13, 15, 16, 17, 18, 19, 20, 21, 25, 26, 28, 29, 31, 32, 33, 34], "suppos": [2, 6, 14, 33], "sure": [5, 14, 15, 32, 33], "svd": 8, "sw": 30, "swish": 34, "switch": [7, 17, 31, 33, 34], "sy": 30, "sycl": 1, "symbol": 20, "symeig": 8, "symlink": 5, "symmetr": [2, 15], "sync": [5, 20], "synchron": [20, 26, 34], "sysctl": 33, "system": [17, 33], "systemat": 7, "t": [2, 5, 7, 8, 14, 15, 16, 17, 18, 20, 26, 32, 34], "t5": [2, 26, 28, 34], "t_valu": 17, "tab": 5, "tabl": [2, 7, 17, 28, 30, 34], 
"tackl": 7, "tacotron2": 34, "take": [1, 2, 7, 8, 10, 12, 13, 14, 18, 21, 25, 26, 30, 31, 33], "taken": 32, "tanh": [13, 34], "target": [5, 6, 10, 13, 14, 17, 34], "target_link_librari": 6, "target_v": 14, "task": [2, 7, 28, 31, 33, 34], "task1": 20, "task2": 20, "taskset": 31, "tbd": 26, "tc": 14, "tcmalloc": 32, "te": 34, "team": [1, 5], "techniqu": [1, 2, 7, 11, 12, 28, 34], "technolog": [1, 7, 28], "technologi": [3, 7], "tee": 31, "tell": [18, 20, 31, 33], "temperatur": [6, 23], "tenosr": 2, "tensor": [2, 6, 7, 8, 11, 15, 16, 17, 20, 26, 28, 32, 34], "tensorexpr_fus": 26, "tensorflow": 18, "tensoriter": 18, "terabyt": 30, "term": 27, "termin": 14, "test": [7, 16, 17, 30, 34], "test_": 5, "test_alias_analysi": 5, "test_bceloss": 5, "test_data": 16, "test_dataload": 16, "test_jit": 5, "test_mseloss": 5, "test_nn": 5, "test_sequenti": 5, "testclassnam": 5, "testjit": 5, "testnam": 5, "testnn": 5, "testsuit": 5, "text": [3, 6, 26, 28, 30, 33], "text_max_length": 2, "tgi": 34, "than": [2, 5, 7, 17, 18, 20, 21, 26, 31, 33, 34], "thank": [5, 34], "thei": [7, 8, 31, 33], "them": [1, 5, 7, 18, 19, 28, 31, 33], "themselv": [31, 34], "therefor": 33, "thi": [2, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 26, 27, 28, 29, 30, 31, 34], "thing": [14, 33], "third": [19, 34], "those": [2, 15, 33], "though": [2, 7], "thrash": 33, "threa": 34, "thread": [1, 2, 7, 20, 26, 30, 31, 32, 33, 34], "three": [7, 16, 17], "threshold": 33, "through": [1, 2, 6, 7, 8, 12, 25, 28, 33, 34], "throughput": [2, 3, 18, 20, 26, 28, 30, 34], "thu": [2, 7, 8, 10, 18, 20, 21, 28, 31, 32, 33], "thudm": 28, "tidi": 5, "tightli": 34, "tiiuae": 28, "tile": 17, "time": [2, 5, 7, 14, 16, 17, 18, 19, 26, 28, 30, 33, 34], "timeout": [2, 5, 21], "timestamp": [2, 28], "tip": 17, "tmp": [10, 32, 34], "to_bfloat16_train": 7, "to_dens": 18, "to_mkldnn": 18, "togeth": [7, 14, 20, 33, 34], "toggl": 7, "token": [2, 6, 23, 28, 30], "tokenize_funct": 6, "tolist": 16, "tool": [17, 33, 34], "toolset": 17, 
"top": [10, 21, 34], "top1": 30, "toplevel": 5, "topologi": [7, 18, 19, 26, 30, 31, 33, 34], "torch": [1, 2, 4, 6, 8, 10, 11, 12, 13, 15, 16, 18, 20, 23, 26, 29, 32, 33, 34], "torch_ccl": 6, "torch_check": 17, "torch_dtyp": [6, 23], "torch_ipex": [17, 34], "torch_ipex_librari": 6, "torchconfig": 6, "torchdynamo": [1, 7, 12, 23, 34], "torchrun": 34, "torchscirpt": 2, "torchscript": [1, 2, 5, 7, 10, 11, 12, 19, 23, 26, 32, 34], "torchserv": [3, 34], "torchvis": [6, 10, 12, 13, 16, 18, 32, 34], "torchvison": 34, "total": [2, 6, 30, 33], "total_new_token": [6, 23], "totensor": [6, 13, 16], "tpp": 2, "trace": [1, 6, 7, 8, 12, 13, 15, 16, 20, 23, 26, 32, 34], "trace_model": 34, "traced_model": [6, 10, 13, 15, 16, 26, 34], "traced_model1": 20, "traced_model2": 20, "track": 1, "track_running_stat": 10, "trade": [8, 28, 30, 34], "tradeoff": 15, "trail": [5, 21], "train": [2, 3, 4, 7, 11, 13, 15, 16, 18, 21, 23, 26, 28, 31, 34], "train_dataload": 16, "train_dataset": [6, 13], "train_load": [6, 8], "training_data": 16, "transfer": 33, "transform": [2, 3, 4, 6, 10, 11, 13, 16, 18, 22, 23, 28, 29, 32, 33, 34], "transformer_handler_gener": 32, "transformerencoderlay": 26, "transnetv2": 34, "transpar": [2, 7, 29, 33, 34], "transpos": [13, 34], "tree": [5, 6], "tri": 12, "trial": 14, "triangular_solv": 8, "trigger": 12, "triplet_margin_loss": 8, "true": [2, 4, 6, 10, 12, 13, 14, 15, 16, 17, 22, 23, 31, 32, 33, 34], "trust_remote_cod": [6, 23], "truth": 21, "try": [2, 5, 6, 7, 12, 14, 16, 26, 31, 33, 34], "tunabl": [30, 32], "tune": [2, 3, 4, 7, 8, 15, 20, 26, 28, 29, 31, 32, 34], "tuned_conf": 16, "tuned_model": [4, 16, 34], "tunin": 32, "tuning_tim": [2, 4, 16, 34], "tupl": [2, 6, 17, 20], "turboboost": 30, "turn": [7, 34], "tutori": [5, 6, 15, 16, 29, 34], "two": [2, 7, 14, 16, 20, 21, 28, 32, 33, 34], "txt": [5, 6, 32], "type": [2, 4, 5, 6, 7, 10, 16, 17, 18, 20, 21, 23, 30, 31, 32, 34], "types": 17, "typic": [6, 10, 28, 33, 34], "u": [30, 32], "u7": 34, "u8": 34, "ubuntu": 30, 
"ucod": 30, "uint32_t": 17, "ultra": 33, "uma": 33, "unabl": 10, "unalign": [17, 34], "uncas": [4, 6, 10, 11, 32, 34], "undefin": [2, 20, 33], "under": [2, 6, 8, 18, 20, 27, 31, 34], "undergo": 26, "underhood": 34, "underli": [1, 17, 28], "underneath": 34, "understand": [21, 28, 33], "undesir": 31, "unexpect": 2, "unifi": [2, 31], "uniform": 32, "uninstal": 5, "union": 2, "unit": [1, 2, 33], "unittest": 5, "unix": 32, "unlik": 6, "unlist": 8, "unnecessari": 33, "unpack": [26, 34], "unpad": 2, "unpredict": 2, "unrel": 6, "unsign": 34, "unsqueez": 2, "unstabl": 8, "until": [5, 20, 21, 33], "untrack": 5, "unus": [31, 33], "unutil": 32, "up": [2, 3, 7, 11, 20, 24, 28, 33, 34], "updat": [2, 5, 7, 16, 19, 21, 22, 34], "upgrad": 34, "upi": 33, "upload": 34, "upper": [18, 33], "upsampl": [18, 34], "upstream": [7, 18, 34], "url": [32, 34], "us": [1, 2, 3, 4, 5, 6, 11, 14, 15, 17, 18, 19, 21, 23, 24, 25, 26, 27, 28, 32, 33, 34], "usabl": 34, "usag": [2, 6, 7, 8, 23, 25, 32, 33, 34], "use_all_nod": 14, "use_default_alloc": [32, 34], "use_logical_cor": [14, 32], "user": [1, 2, 7, 9, 10, 12, 13, 15, 16, 18, 20, 26, 31, 32, 33, 34], "user_model": [6, 15], "usr": [6, 17, 31, 32], "usual": [2, 18, 20, 33], "usuali": 33, "usus": 32, "ut": 31, "util": [1, 6, 7, 10, 13, 15, 16, 18, 21, 28, 31, 33, 34], "ux": 34, "v": 5, "v0": [28, 34], "v1": [28, 34], "v2": [28, 30, 34], "v3": 34, "valid": [2, 21, 34], "valu": [2, 6, 10, 14, 16, 17, 19, 20, 21, 22, 26, 28, 31, 32, 33, 34], "value_cach": 2, "value_token": 2, "var": 29, "vari": 16, "variabl": [2, 5, 17, 30, 31, 32, 33, 34], "varianc": 34, "variance_epsilon": 2, "variant": [2, 8, 28, 34], "variou": [6, 7, 14, 28, 33, 34], "varlen_attent": [2, 34], "varlenattent": [2, 34], "varlenattention_modul": 2, "ve": 34, "vec256": 17, "vec512": 17, "vec_bia": 17, "vector": [1, 2, 6, 17, 18, 25, 28], "vectors": 17, "verbos": [2, 4, 31], "verbose_off": 2, "verbose_on": 2, "verbose_on_cr": 2, "veri": [2, 5, 15, 18, 28], "verifi": [6, 7], "version": 
[6, 7, 16, 17, 25, 26, 27, 32, 33, 34], "vgg": 30, "vgg11": 30, "via": [2, 5, 6, 7, 18, 20, 30, 31, 33, 34], "video": 7, "view": [13, 18, 20, 21], "view_as_complex": 8, "virtual": 17, "virtual_env": [31, 32], "vision": [3, 6, 30], "visit": [7, 33], "vllm": [2, 34], "vm": 34, "vnni": [1, 15, 17, 25, 28], "vocab_s": [6, 11, 32], "voic": 33, "void": 17, "vstack": 6, "w": [7, 16, 18, 21, 30, 32], "wa": [7, 31, 32, 33, 34], "wai": [5, 10, 16, 18, 28, 34], "wait": [20, 33], "wake": 20, "walk": 34, "want": [2, 5, 7, 14, 15, 17, 20, 31, 34], "warm": 33, "warn": [5, 6, 12, 31, 32, 34], "wav2vec2": 33, "wave2vec": 34, "wc": 18, "we": [1, 2, 5, 6, 7, 8, 9, 10, 14, 15, 16, 17, 18, 19, 20, 21, 23, 28, 30, 32, 33, 34], "web": 28, "webpag": 34, "websit": 7, "wei_ic_observ": 2, "wei_observ": 2, "weight": [1, 2, 7, 10, 12, 13, 15, 16, 18, 20, 22, 23, 26, 28, 34], "weight_dacai": 21, "weight_decai": [7, 19], "weight_dtyp": [2, 6, 29], "weight_qschem": 2, "weights_prepack": [2, 6, 7, 23, 26], "well": [1, 2, 5, 6, 7, 11, 16, 20, 21, 24, 28, 32, 33, 34], "were": [30, 31, 32, 33], "west": 30, "what": [3, 5, 6, 8, 23], "wheel": 34, "when": [2, 5, 6, 7, 8, 9, 14, 18, 19, 20, 21, 22, 25, 26, 28, 30, 31, 32, 33, 34], "where": [2, 5, 7, 16, 21, 33, 34], "wherea": 30, "whether": [2, 6, 8, 16, 18, 22, 23, 33], "which": [1, 2, 5, 7, 8, 10, 14, 15, 16, 17, 18, 20, 26, 28, 30, 31, 32, 33, 34], "while": [2, 7, 8, 11, 12, 18, 21, 26, 28, 31, 32, 33, 34], "whisper": [2, 28, 34], "whl": 34, "who": 10, "whole": [19, 20, 33], "wide": [21, 34], "wider": 1, "widespread": [1, 7, 28], "width": [17, 18], "wikipedia": [13, 33], "wise": [2, 16, 19, 22, 29, 34], "wish": [5, 7], "with_arg": [2, 6, 15], "within": [5, 16, 21, 29, 33, 34], "without": [2, 5, 6, 7, 8, 10, 16, 20, 21, 26, 32, 34], "wlydcrb1": 30, "wn": 18, "won": [2, 7, 8, 17, 26], "woq": [2, 28], "woqactquantmod": 2, "woqlowpmod": [2, 6, 29], "woqweightdtyp": [2, 6, 29], "woqweightqschem": 2, "work": [2, 5, 6, 7, 14, 15, 17, 20, 26, 28, 29, 31, 33, 
34], "workabl": 2, "workaround": [26, 34], "worker": [20, 31], "workflow": 34, "workload": [1, 6, 7, 8, 10, 11, 12, 21, 26, 28, 29, 30, 31, 33, 34], "workload1": 30, "workspac": 6, "world": [5, 7], "world_siz": [6, 29], "worri": 32, "wors": 2, "worth": 34, "would": [2, 5, 6, 14, 16, 17, 18, 30, 31, 32, 33, 34], "wrap": 34, "write": [7, 17], "written": [5, 6, 17], "x": [1, 2, 5, 6, 8, 10, 13, 15, 16, 17, 18, 20, 21, 23, 26, 34], "x1": 20, "x2": 20, "x86": 3, "x86_64": 30, "xcr0": 17, "xdf": 5, "xe": 33, "xeon": [3, 7, 14, 21, 28, 30, 32, 33, 34], "xl": 28, "xlm": 26, "xmx": 1, "xpu": [1, 2, 3, 34], "xsave": 17, "xx": 6, "xx_c": 34, "xx_v": 34, "y": [8, 15, 16, 20, 21, 34], "y1": 20, "y1_futur": 20, "y2": 20, "y2_futur": 20, "y_runtim": 20, "yaml": 14, "ye": 5, "year": 28, "yet": [2, 6, 26, 34], "yield": [1, 7, 33], "yolov3": 34, "you": [1, 2, 5, 6, 7, 8, 13, 14, 15, 17, 18, 20, 23, 25, 26, 28, 29, 31, 33, 34], "your": [1, 5, 6, 7, 8, 10, 14, 15, 20, 23, 24, 26, 27, 28, 29, 34], "your_calibration_dataset": 29, "your_conf_fil": [4, 34], "your_generation_param": 34, "your_python_script": [4, 34], "your_pytorch_script": [4, 31], "yuan": [2, 28], "yuan2": 28, "z11pa": 33, "zero": [6, 15, 34], "zero_grad": [6, 7, 16], "zero_tensor": 2, "zip": [6, 23, 34], "zone": [30, 34], "zoo": [6, 30], "\u03b1": 21}, "titles": ["Intel\u00ae Extension for PyTorch* CPU ISA Dynamic Dispatch Design Doc", "Intel\u00ae Extension for PyTorch*", "API Documentation", "Blogs & Publications", "Cheat Sheet", "Contribution", "Examples", "Features", "Auto Mixed Precision (AMP)", "Auto Channels Last", "Codeless Optimization (Prototype)", "Fast BERT (Prototype)", "Graph Capture (Prototype)", "Graph Optimization", "HyperTune (Prototype)", "Intel\u00ae Extension for PyTorch* optimizations for quantization", "INT8 Recipe Tuning API (Prototype)", "ISA Dynamic Dispatching", "Channels Last", "Optimizer Fusion", "Runtime Extension", "Split SGD", "Smooth Quant Recipe Tuning API (Prototype)", "Quick Start", 
"Installation", "Introduction", "Troubleshooting", "License", "Large Language Models (LLM) Optimization Overview", "LLM Optimizations Frontend API", "Performance", "Launch Script Usage Guide", "TorchServe with Intel\u00ae Extension for PyTorch*", "Performance Tuning Guide", "Releases"], "titleterms": {"": 34, "0": [6, 7, 34], "1": [7, 14, 32, 34], "10": [30, 34], "100": 34, "11": [30, 34], "12": 34, "13": [7, 34], "2": [6, 7, 14, 32, 34], "200": [30, 34], "2xlarg": 30, "3": [32, 34], "300": 34, "4": [32, 34], "8": 34, "9": 34, "That": 18, "The": 10, "__call__": 10, "access": [28, 33], "accuraci": 30, "add": 17, "ai": [6, 30], "algorithm": 16, "all": [18, 31], "alloc": [31, 33], "alpha": [16, 34], "alreadi": 10, "amp": [7, 8], "an": 30, "api": [2, 7, 9, 13, 16, 17, 18, 22, 25, 28, 29], "appli": 10, "architectur": 1, "archiv": 32, "asynchron": 20, "aten": [17, 18], "attr": 10, "auto": [7, 8, 9, 16, 20], "autocast": 8, "autotun": 16, "aw": 30, "b": 18, "basic": 20, "behavior": 8, "benchmark": 32, "bert": [2, 6, 7, 11, 32], "beta": [6, 7], "better": 5, "bf16": [6, 10, 13, 29], "bfloat16": [6, 8, 21, 26, 30], "bind": 20, "block": 18, "blog": 3, "boost": 32, "build": [5, 17], "c": [5, 6, 18], "c6i": 30, "cach": [28, 33], "calibr": [6, 15], "can": 8, "captur": [7, 12], "case": [8, 10, 20], "center": 30, "chang": 34, "channel": [7, 9, 18, 33], "cheat": 4, "check": 17, "code": 17, "codegen": 17, "codeless": [7, 10], "command": 10, "common": 29, "compil": [7, 17], "configur": [20, 30, 33], "content": [32, 33], "contribut": 5, "convers": 18, "convert": 15, "convolut": 18, "core": [20, 31, 32], "correct": 26, "coverag": 18, "cpp": 17, "cpu": [0, 2, 17, 18, 33], "creat": [18, 32], "creation": 18, "csrc": 17, "custom": [17, 28], "d": 18, "data": [28, 30], "debug": [5, 17], "deepspe": [28, 29], "default": [8, 9, 14, 18, 31], "defin": [14, 15], "demo": 28, "denorm": 33, "deploi": [15, 32], "deploy": 6, "descent": 21, "descript": [11, 12], "design": [0, 17, 20, 31], "detail": 20, 
"determin": 16, "develop": 5, "disabl": 9, "dispatch": [0, 7, 17], "dispatchstub": 17, "distribut": [6, 28, 29], "do": 15, "doc": 0, "document": [2, 5, 25, 32, 33], "dure": 20, "dynam": [0, 6, 7, 15, 17, 26], "dyndisp": 17, "eager": [6, 8], "eas": [9, 13], "easi": 7, "ec2": 30, "elig": 8, "enabl": 9, "exampl": [6, 10, 11, 12, 14, 16, 17, 20, 31], "examples1": 20, "examples2": 20, "examples3": 20, "explicitli": 10, "export": 32, "extens": [0, 1, 5, 7, 15, 20, 26, 32], "fast": [2, 6, 7, 11], "featur": [6, 7, 11, 12, 17], "file": 32, "fix": 16, "float32": [6, 8], "fold": 13, "folder": 17, "format": 18, "forward": 10, "fp32": [6, 10, 13, 29, 30], "from": [6, 7], "frontend": 29, "fusion": [13, 19], "gener": [2, 26], "get": 25, "gnu": [31, 33], "gradient": 21, "graph": [2, 7, 12, 13, 28], "guid": [31, 33], "h": 17, "hardwar": [30, 33], "highlight": 34, "how": 20, "huggingfac": 10, "hyperparamet": 14, "hypertun": [7, 14], "i": [18, 20, 31], "ii": 31, "iii": 31, "implement": [17, 20], "improv": 34, "includ": 31, "index": 31, "indirect": 28, "infer": [6, 8, 28, 29, 31, 32], "input": [8, 20], "instal": [24, 32], "instanc": [28, 30, 31], "instead": 10, "int4": 6, "int8": [6, 7, 13, 16, 26, 30, 32], "intel": [0, 1, 5, 6, 15, 30, 31, 32, 33], "intrin": 17, "introduct": [8, 19, 25], "iomp": 20, "ipex": [10, 28], "isa": [0, 7, 17], "issu": [9, 20, 34], "iv": 31, "jemalloc": [31, 33], "jit": 10, "kernel": [17, 18], "known": [9, 20, 34], "kv": 28, "languag": [6, 7, 28], "larg": [6, 7, 28], "last": [7, 9, 18, 33], "latenc": 31, "launch": [10, 31], "launcher": [14, 32], "layout": 18, "level": [2, 17, 28], "librari": 31, "licens": 27, "linear": 28, "lint": 5, "list": 28, "llm": [2, 6, 7, 23, 28, 29, 30], "load": 20, "local": 5, "logic": 31, "low": 28, "manner": 18, "manual": 17, "matter": 18, "memori": [18, 31, 33], "method": 10, "methodologi": [13, 28], "mix": [7, 8], "mode": [6, 28, 31], "model": [6, 7, 13, 15, 18, 20, 28, 32], "modul": [2, 10, 20, 28], "motiv": 10, "multi": 32, 
"multipl": 31, "multistream": 20, "nativ": 18, "nchw": 18, "nchw16c": 18, "new": [6, 7, 34], "nhwc": 18, "node": 31, "non": 33, "note": 34, "numa": 33, "numactl": 33, "number": [30, 31, 33], "omp_num_thread": 33, "omp_thread_limit": 33, "onednn": [18, 33], "onli": [6, 29], "op": 8, "openmp": [31, 33], "oper": [7, 18, 19, 28], "optim": [2, 7, 10, 13, 15, 19, 28, 29], "origin": 10, "other": 34, "output": 20, "overview": [17, 28, 30, 31, 33], "path": 8, "pattern": 13, "perform": [20, 26, 30, 32, 33, 34], "physic": 31, "pin": 32, "precis": [7, 8, 28], "preload": 20, "prepar": 15, "prerequisit": 11, "primit": [18, 33], "privat": 17, "process": 17, "product": 30, "promot": 8, "prototyp": [2, 6, 7, 10, 11, 12, 14, 16, 22, 28], "pseudocod": 29, "public": 3, "pytest": 5, "python": [5, 6, 7], "pytorch": [0, 1, 5, 15, 18, 32], "qconfig": 15, "quant": 22, "quantiz": [2, 6, 7, 15, 16, 29], "quick": 23, "recip": [16, 20, 22], "refer": [6, 8], "regist": [18, 32], "regress": 26, "releas": 34, "requir": [17, 20], "resnet50": [6, 32], "result": [26, 34], "runtim": [2, 7, 20, 26], "scale": 32, "scenario": 29, "script": 31, "search": 14, "select": 17, "serial": 32, "serv": 32, "set": 20, "sgd": 21, "shape": 26, "sheet": 4, "singl": [28, 31], "smooth": [6, 16, 22], "smoothquant": 29, "softwar": [30, 33], "space": 14, "specif": [8, 17], "split": 21, "start": [23, 25, 32], "static": [6, 15], "statu": 18, "stochast": 21, "stride": 18, "struct": 17, "structur": [20, 33], "stub": 17, "support": [1, 8, 10], "target": 18, "task": 20, "tcmalloc": [31, 33], "tensor": 18, "test": 5, "thi": [32, 33], "through": 16, "throughput": 31, "tip": 5, "torch": 7, "torchdynamo": [6, 26], "torchscript": [6, 8], "torchserv": 32, "trace": 10, "train": [6, 8], "troubleshoot": 26, "tune": [14, 16, 22, 33], "type": [8, 28], "uniform": 33, "unit": 5, "us": [7, 8, 9, 10, 13, 16, 20, 31], "usag": [10, 11, 12, 14, 16, 20, 26, 29, 31], "user": 14, "v": 31, "v1": 30, "vec": 17, "verifi": 28, "version": 30, "vi": 31, 
"via": 28, "vii": 31, "viii": 31, "weight": [6, 29], "what": [18, 34], "widest": 8, "wip": 18, "woq": 29, "worker": 32, "write": [5, 18], "xyz": 17, "xyzkrnl": 17, "your": 31, "your_conf_fil": 14, "your_python_script": 14}})
\ No newline at end of file
+Search.setIndex({"alltitles": {"$\\alpha$ Usage": [[16, "alpha-usage"]], "1. Creating a serialized file": [[32, "creating-a-serialized-file"]], "1. Defining hyperparameters to tune:": [[14, "defining-hyperparameters-to-tune"]], "1.0.0-Alpha": [[34, "id47"]], "1.0.1-Alpha": [[34, "alpha"]], "1.0.2": [[34, "id46"]], "1.1.0": [[34, "id44"]], "1.10.0": [[34, "id34"]], "1.10.100": [[34, "id33"]], "1.11.0": [[34, "id31"]], "1.11.200": [[34, "id29"]], "1.12.0": [[34, "id26"]], "1.12.100": [[34, "id25"]], "1.12.300": [[34, "id23"]], "1.13.0": [[34, "id20"]], "1.13.100": [[34, "id18"]], "1.2.0": [[34, "id41"]], "1.8.0": [[34, "id39"]], "1.9.0": [[34, "id38"]], "2. Creating a Model Archive": [[32, "creating-a-model-archive"]], "2. Defining the search spaces of the hyperparameters:": [[14, "defining-the-search-spaces-of-the-hyperparameters"]], "2.0.0": [[34, "id16"]], "2.0.100": [[34, "id14"]], "2.1.0": [[34, "id12"]], "2.1.100": [[34, "id10"]], "2.2.0": [[34, "id8"]], "2.3.0": [[34, "id6"]], "2.3.100": [[34, "id4"]], "2.4.0": [[34, "id2"]], "2.5.0": [[34, "id1"]], "3. Start TorchServe to serve the model": [[32, "start-torchserve-to-serve-the-model"]], "4. 
Registering and Deploying model": [[32, "registering-and-deploying-model"]], "": [[14, "your-python-script"]], "API Documentation": [[2, null], [25, "api-documentation"]], "Accuracy": [[30, "accuracy"]], "Add Custom Kernel": [[17, "add-custom-kernel"]], "Algorithm: Auto-tuning of $\\alpha$.": [[16, "algorithm-auto-tuning-of-alpha"]], "Already using Jit Trace": [[10, "already-using-jit-trace"]], "Already using ipex.optimize": [[10, "already-using-ipex-optimize"]], "Architecture": [[1, "architecture"]], "Auto Channels Last": [[7, "auto-channels-last"], [9, null]], "Auto Mixed Precision (AMP)": [[7, "auto-mixed-precision-amp"], [8, null]], "Autocast Op Reference": [[8, "autocast-op-reference"]], "BERT": [[6, "bert"], [6, "id2"], [6, "id4"], [6, "id7"], [6, "id10"], [6, "id13"], [32, "bert"]], "BFloat16": [[6, "bfloat16"], [21, "bfloat16"], [26, "bfloat16"]], "Benchmarking with Launcher": [[32, "benchmarking-with-launcher"]], "Benchmarking with Launcher Core Pinning": [[32, "benchmarking-with-launcher-core-pinning"]], "Better local unit tests with pytest": [[5, "better-local-unit-tests-with-pytest"]], "Blogs & Publications": [[3, null]], "Building documentation": [[5, "building-documentation"]], "C++": [[6, "c"]], "C++ Unit Testing": [[5, "c-unit-testing"]], "CPU Channels Last Targets": [[18, "cpu-channels-last-targets"]], "CPU ISA build compiler requirement": [[17, "cpu-isa-build-compiler-requirement"]], "CPU Runtime": [[2, "module-intel_extension_for_pytorch.cpu.runtime"]], "CPU feature check": [[17, "cpu-feature-check"]], "Calibration": [[6, "calibration"]], "Channels Last": [[18, null], [33, "channels-last"]], "Cheat Sheet": [[4, null]], "Code Folder Struct": [[17, "code-folder-struct"]], "CodeGen Process": [[17, "codegen-process"]], "Codeless Optimization (Prototype)": [[10, null]], "Codeless Optimization (Prototype, NEW feature from 1.13.0)": [[7, "codeless-optimization-prototype-new-feature-from-1-13-0"]], "Command to apply ipex optimization for BF16": [[10, 
"command-to-apply-ipex-optimization-for-bf16"]], "Command to apply ipex optimization for FP32": [[10, "command-to-apply-ipex-optimization-for-fp32"]], "Configuration": [[30, "configuration"], [30, "id2"], [30, "id5"]], "Contents of this Document": [[32, "contents-of-this-document"], [33, "contents-of-this-document"]], "Contributing to Intel\u00ae Extension for PyTorch*": [[5, "contributing-to-intel-extension-for-pytorch"]], "Contribution": [[5, null]], "Convert to Dynamic Quantized Model and Deploy": [[15, "convert-to-dynamic-quantized-model-and-deploy"]], "Convert to Static Quantized Model and Deploy": [[15, "convert-to-static-quantized-model-and-deploy"]], "Creating and Exporting INT8 model for Intel\u00ae Extension for PyTorch*": [[32, "creating-and-exporting-int8-model-for-intel-extension-for-pytorch"]], "Default Precision": [[8, "default-precision"]], "Default memory allocator": [[31, "default-memory-allocator"]], "Default search space": [[14, "default-search-space"]], "Define QConfig": [[15, "id1"]], "Define qconfig": [[15, "define-qconfig"]], "Defining hyperparameters and their search spaces": [[14, "defining-hyperparameters-and-their-search-spaces"]], "Demos": [[28, "demos"]], "Denormal Number": [[33, "denormal-number"]], "Deployment": [[6, "deployment"]], "Design of Task": [[20, "design-of-task"]], "Detail Design": [[20, "detail-design"]], "Determining the alpha through auto-tuning": [[16, "determining-the-alpha-through-auto-tuning"]], "Developing Intel\u00ae Extension for PyTorch*": [[5, "developing-intel-extension-for-pytorch"]], "Dispatch Stub implementation: csrc/cpu/dyndisp/DispatchStub.cpp and csrc/cpu/dyndisp/DispatchStub.h": [[17, "dispatch-stub-implementation-csrc-cpu-dyndisp-dispatchstub-cpp-and-csrc-cpu-dyndisp-dispatchstub-h"]], "Distributed Inference": [[28, "distributed-inference"]], "Distributed Inference with DeepSpeed": [[29, "distributed-inference-with-deepspeed"]], "Distributed Training": [[6, "distributed-training"]], "Dynamic Dispatch 
Design": [[17, "dynamic-dispatch-design"]], "Dynamic Quantization": [[6, "dynamic-quantization"], [15, "dynamic-quantization"]], "Dynamic Shape": [[26, "dynamic-shape"]], "Eager Mode": [[6, "eager-mode"], [6, "id5"]], "Ease-of-use auto channels last API": [[9, "ease-of-use-auto-channels-last-api"]], "Ease-of-use graph optimization API": [[13, "ease-of-use-graph-optimization-api"]], "Easy-to-use Python API": [[7, "easy-to-use-python-api"]], "Example Usage with HuggingFace": [[10, "example-usage-with-huggingface"]], "Example of MultiStream Module": [[20, "example-of-multistream-module"]], "Example of asynchronous task": [[20, "example-of-asynchronous-task"]], "Example of configuring core binding": [[20, "example-of-configuring-core-binding"]], "Example:": [[17, "example"], [17, "id1"]], "Examples": [[6, null]], "Examples1: Basic Usage": [[20, "examples1-basic-usage"]], "Examples2: Usage with \u201cAUTO\u201d setting": [[20, "examples2-usage-with-auto-setting"]], "Examples3: Usage for models with structure inputs/outputs": [[20, "examples3-usage-for-models-with-structure-inputs-outputs"]], "FP32 and BF16 fusion patterns": [[13, "fp32-and-bf16-fusion-patterns"]], "FP32 and BF16 models": [[13, "fp32-and-bf16-models"]], "FP32 and BFloat16 with v1.10": [[30, "fp32-and-bfloat16-with-v1-10"]], "FP32 with v1.11.200 on an AWS EC2 C6i.2xlarge instance": [[30, "fp32-with-v1-11-200-on-an-aws-ec2-c6i-2xlarge-instance"]], "FP32/BF16": [[6, "fp32-bf16"], [29, "fp32-bf16"]], "Fast BERT (Prototype)": [[11, null]], "Fast BERT Optimization (Prototype, NEW feature from 2.0.0)": [[7, "fast-bert-optimization-prototype-new-feature-from-2-0-0"]], "Fast Bert (Prototype)": [[2, "fast-bert-prototype"], [6, "fast-bert-prototype"]], "Feature Description": [[11, "feature-description"], [12, "feature-description"]], "Features": [[7, null]], "Float32": [[6, "float32"]], "Folding": [[13, "folding"]], "Fusion": [[13, "fusion"]], "GNU OpenMP": [[33, "gnu-openmp"]], "GNU OpenMP Library": [[31, 
"gnu-openmp-library"]], "General": [[2, "general"]], "General Usage": [[26, "general-usage"]], "Get Started": [[25, "get-started"]], "Graph Capture (Prototype)": [[12, null]], "Graph Capture (Prototype, NEW feature from 1.13.0)": [[7, "graph-capture-prototype-new-feature-from-1-13-0"]], "Graph Optimization": [[2, "graph-optimization"], [7, "graph-optimization"], [13, null], [28, "graph-optimization"]], "Hardware Configuration": [[30, "hardware-configuration"], [30, "id7"], [33, "hardware-configuration"]], "Highlights": [[34, "highlights"], [34, "id3"], [34, "id5"], [34, "id7"], [34, "id9"], [34, "id11"], [34, "id13"], [34, "id15"], [34, "id17"], [34, "id19"], [34, "id21"], [34, "id24"], [34, "id27"], [34, "id30"], [34, "id32"], [34, "id35"]], "How the core binding is implemented": [[20, "how-the-core-binding-is-implemented"]], "HyperTune (Prototype)": [[14, null]], "HyperTune (Prototype, NEW feature from 1.13.0)": [[7, "hypertune-prototype-new-feature-from-1-13-0"]], "Hyperparameters": [[14, "hyperparameters"]], "I. Use all physical cores": [[31, "i-use-all-physical-cores"]], "II. Use all cores including logical cores": [[31, "ii-use-all-cores-including-logical-cores"]], "III. Use physical cores on designated nodes": [[31, "iii-use-physical-cores-on-designated-nodes"]], "INT8": [[6, "int8"], [26, "int8"]], "INT8 Quantization": [[7, "int8-quantization"]], "INT8 Recipe Tuning API (Prototype)": [[16, null]], "INT8 fusion patterns": [[13, "int8-fusion-patterns"]], "INT8 models": [[13, "int8-models"]], "INT8 with v1.11": [[30, "int8-with-v1-11"]], "IOMP preload or load during the runtime": [[20, "iomp-preload-or-load-during-the-runtime"]], "ISA Dynamic Dispatching": [[7, "isa-dynamic-dispatching"], [17, null]], "ISA intrinics specific kernel example:": [[17, "isa-intrinics-specific-kernel-example"]], "IV. 
Use your designated number of cores": [[31, "iv-use-your-designated-number-of-cores"]], "Indirect Access KV Cache": [[28, "indirect-access-kv-cache"]], "Inference": [[6, "inference"]], "Inference with Eager Path": [[8, "inference-with-eager-path"]], "Inference with TorchScript Path": [[8, "inference-with-torchscript-path"]], "Install Intel\u00ae Extension for PyTorch*": [[32, "install-intel-extension-for-pytorch"]], "Installation": [[24, null]], "Intel CPU Structure": [[33, "intel-cpu-structure"]], "Intel OpenMP": [[33, "intel-openmp"]], "Intel OpenMP Library": [[31, "intel-openmp-library"]], "Intel\u00ae AI Reference Models": [[6, "intel-ai-reference-models"]], "Intel\u00ae Extension for PyTorch*": [[1, null]], "Intel\u00ae Extension for PyTorch* CPU ISA Dynamic Dispatch Design Doc": [[0, null]], "Intel\u00ae Extension for PyTorch* optimizations for quantization": [[15, null]], "Introduction": [[8, "introduction"], [19, "introduction"], [25, null]], "Jemalloc": [[31, "jemalloc"], [33, "jemalloc"]], "Kernel Stub: csrc/cpu/aten/xyz.cpp and csrc/cpu/aten/xyz.h": [[17, "kernel-stub-csrc-cpu-aten-xyz-cpp-and-csrc-cpu-aten-xyz-h"]], "Kernel implementation: csrc/cpu/aten/kernels/xyzKrnl.cpp": [[17, "kernel-implementation-csrc-cpu-aten-kernels-xyzkrnl-cpp"]], "Known Issues": [[34, "known-issues"], [34, "id22"], [34, "id28"], [34, "id36"]], "Known issue": [[9, "known-issue"], [34, "known-issue"], [34, "id49"]], "Known issues": [[20, "known-issues"], [34, "id43"]], "LLM Module Level Optimizations (Prototype)": [[2, "llm-module-level-optimizations-prototype"]], "LLM Optimizations Frontend API": [[29, null]], "LLM Performance": [[30, "llm-performance"]], "LLM Quick Start": [[23, "llm-quick-start"]], "Large Language Model (LLM)": [[6, "large-language-model-llm"]], "Large Language Models (LLM) Optimization Overview": [[28, null]], "Large Language Models (LLM, NEW feature from 2.1.0)": [[7, "large-language-models-llm-new-feature-from-2-1-0"]], "Launch Script Usage Guide": [[31, 
null]], "Launcher Core Pinning to Boost Performance of TorchServe Multi Worker Inference": [[32, "launcher-core-pinning-to-boost-performance-of-torchserve-multi-worker-inference"]], "Launcher Hyperparameters": [[14, "launcher-hyperparameters"]], "License": [[27, null]], "Linear Operator Optimization": [[28, "linear-operator-optimization"]], "Local linting": [[5, "local-linting"]], "Low Precision Data Types": [[28, "low-precision-data-types"]], "Memory Allocator": [[33, "memory-allocator"]], "Memory Format Is All That Matters": [[18, "memory-format-is-all-that-matters"]], "Methodology": [[13, "methodology"]], "Module Level Optimization API for customized LLM (Prototype)": [[28, "module-level-optimization-api-for-customized-llm-prototype"]], "Module uses forward method explicitly instead of the __call__ attr": [[10, "module-uses-forward-method-explicitly-instead-of-the-call-attr"]], "Motivation": [[10, "motivation"]], "Multiple instances for inference": [[31, "multiple-instances-for-inference"]], "NOTE": [[34, "note"]], "Non-Uniform Memory Access (NUMA)": [[33, "non-uniform-memory-access-numa"]], "Numactl": [[33, "numactl"]], "OMP_NUM_THREADS": [[33, "omp-num-threads"]], "OMP_THREAD_LIMIT": [[33, "omp-thread-limit"]], "OneDNN primitive cache": [[33, "onednn-primitive-cache"]], "Op Eligibility": [[8, "op-eligibility"]], "Op-Specific Behavior": [[8, "op-specific-behavior"]], "OpenMP": [[33, "openmp"]], "Operation Fusion": [[19, "operation-fusion"]], "Operator Optimization": [[7, "operator-optimization"]], "Ops that can autocast to bfloat16": [[8, "ops-that-can-autocast-to-bfloat16"]], "Ops that can autocast to float32": [[8, "ops-that-can-autocast-to-float32"]], "Ops that promote to the widest input type": [[8, "ops-that-promote-to-the-widest-input-type"]], "Optimization Methodologies": [[28, "optimization-methodologies"]], "Optimizer Fusion": [[19, null]], "Optimizer Optimization": [[7, "optimizer-optimization"]], "Others": [[34, "others"]], "Overview": [[17, 
"overview"], [30, "overview"], [31, "overview"], [33, "overview"]], "Performance": [[30, null], [34, "performance"]], "Performance Boost with Intel\u00ae Extension for PyTorch* and Launcher": [[32, "performance-boost-with-intel-extension-for-pytorch-and-launcher"]], "Performance Data for Intel\u00ae AI Data Center Products": [[30, "performance-data-for-intel-ai-data-center-products"]], "Performance Improvement": [[34, "performance-improvement"]], "Performance Numbers": [[30, "performance-numbers"], [30, "id1"], [30, "id4"]], "Performance Regression": [[26, "performance-regression"]], "Performance Result": [[34, "performance-result"]], "Performance Tuning Guide": [[33, null]], "Performance recipes": [[20, "performance-recipes"]], "Prepare Model": [[15, "prepare-model"]], "Prepare Model and Do Calibration": [[15, "prepare-model-and-do-calibration"]], "Prerequisite": [[11, "prerequisite"]], "Private Debug APIs": [[17, "private-debug-apis"]], "Pseudocode of Common Usage Scenarios": [[29, "pseudocode-of-common-usage-scenarios"]], "PyTorch Channels Last Memory Format APIs": [[18, "pytorch-channels-last-memory-format-apis"]], "PyTorch Strided Layout": [[18, "pytorch-strided-layout"]], "Python": [[6, "python"]], "Python Unit Testing": [[5, "python-unit-testing"]], "Quantization": [[2, "module-intel_extension_for_pytorch.quantization"]], "Quick Start": [[23, null]], "Releases": [[34, null]], "Requirements": [[20, "requirements"]], "ResNet50": [[32, "resnet50"]], "Resnet50": [[6, "resnet50"], [6, "id1"], [6, "id3"], [6, "id6"], [6, "id9"], [6, "id12"]], "Result Correctness": [[26, "result-correctness"]], "Runtime Extension": [[7, "runtime-extension"], [20, null], [26, "runtime-extension"]], "Scaling workers": [[32, "scaling-workers"]], "Select ISA level manually.": [[17, "select-isa-level-manually"]], "Serving model with Intel\u00ae Extension for PyTorch*": [[32, "serving-model-with-intel-extension-for-pytorch"]], "Single instance for inference": [[31, 
"single-instance-for-inference"]], "Smooth Quant Recipe Tuning API (Prototype)": [[22, null]], "Smooth Quantization Autotune": [[16, "smooth-quantization-autotune"]], "Smooth Quantization INT8": [[6, "smooth-quantization-int8"]], "SmoothQuant": [[29, "smoothquant"]], "Software Configuration": [[33, "software-configuration"]], "Software Version": [[30, "software-version"], [30, "id3"], [30, "id6"]], "Split SGD": [[21, null], [21, "id2"]], "Static Quantization": [[6, "static-quantization"], [15, "static-quantization"]], "Stochastic Gradient Descent (SGD)": [[21, "stochastic-gradient-descent-sgd"]], "Support": [[1, "support"]], "TCMalloc": [[31, "tcmalloc"], [33, "tcmalloc"]], "The origin command with ipex launch": [[10, "the-origin-command-with-ipex-launch"]], "Tips": [[5, "tips"]], "Tips and Debugging": [[5, "tips-and-debugging"]], "TorchDynamo": [[26, "torchdynamo"]], "TorchDynamo Mode (Beta, NEW feature from 2.0.0)": [[6, "torchdynamo-mode-beta-new-feature-from-2-0-0"], [6, "id11"]], "TorchScript Mode": [[6, "torchscript-mode"], [6, "id8"]], "TorchServe with Intel\u00ae Extension for PyTorch*": [[32, null]], "TorchServe with Launcher": [[32, "torchserve-with-launcher"]], "Training": [[6, "training"]], "Training Support": [[8, "training-support"]], "Troubleshooting": [[26, null]], "Unit testing": [[5, "unit-testing"]], "Usage Example": [[11, "usage-example"], [12, "usage-example"], [16, "usage-example"]], "Usage Examples": [[14, "usage-examples"], [31, "usage-examples"]], "Usage of Hypertune": [[14, "usage-of-hypertune"]], "Usage of Jemalloc/TCMalloc/Default memory allocator": [[31, "usage-of-jemalloc-tcmalloc-default-memory-allocator"]], "Usage of OpenMP library": [[31, "usage-of-openmp-library"]], "Usage of launch script": [[31, "usage-of-launch-script"]], "Use Case": [[8, "use-case"]], "Use Case not supported": [[10, "use-case-not-supported"]], "Use Cases": [[20, "use-cases"]], "User defined search space": [[14, "user-defined-search-space"]], "Using a fixed 
alpha": [[16, "using-a-fixed-alpha"]], "V. Throughput mode": [[31, "v-throughput-mode"]], "VI. Latency mode": [[31, "vi-latency-mode"]], "VII. Your designated number of instances": [[31, "vii-your-designated-number-of-instances"]], "VIII. Your designated number of instances and instance index": [[31, "viii-your-designated-number-of-instances-and-instance-index"]], "Vec specific kernel example:": [[17, "vec-specific-kernel-example"]], "Verified for distributed inference mode via DeepSpeed": [[28, "verified-for-distributed-inference-mode-via-deepspeed"]], "Verified for single instance mode": [[28, "verified-for-single-instance-mode"]], "Weight Only Quantization (WOQ)": [[29, "weight-only-quantization-woq"]], "Weight Only Quantization INT8/INT4": [[6, "weight-only-quantization-int8-int4"]], "What is Channels Last": [[18, "what-is-channels-last"]], "What\u2019s Changed": [[34, "what-s-changed"], [34, "id37"]], "What\u2019s New": [[34, "what-s-new"], [34, "id40"], [34, "id42"], [34, "id45"], [34, "id48"]], "Writing Channels Last Kernels": [[18, "writing-channels-last-kernels"]], "Writing documentation": [[5, "writing-documentation"]], "a. Create NHWC Memory": [[18, "a-create-nhwc-memory"]], "a. NCHW (default)": [[18, "a-nchw-default"]], "a. Status on CPU": [[18, "a-status-on-cpu"]], "a. tensor creation": [[18, "a-tensor-creation"]], "b. Create Convolution Primitive": [[18, "b-create-convolution-primitive"]], "b. NHWC (WIP for CPU)": [[18, "b-nhwc-wip-for-cpu"]], "b. Register Channels Last Kernel in ATen Native Manner": [[18, "b-register-channels-last-kernel-in-aten-native-manner"]], "b. tensor conversion": [[18, "b-tensor-conversion"]], "c. Blocked (nChw16c)": [[18, "c-blocked-nchw16c"]], "c. Register oneDNN Kernel on Channels Last": [[18, "c-register-onednn-kernel-on-channels-last"]], "c. model conversion": [[18, "c-model-conversion"]], "d. 
operator coverage": [[18, "d-operator-coverage"]], "default": [[9, "default"]], "disable": [[9, "disable"]], "enable": [[9, "enable"]], "ipex.llm Optimized Model List for Inference": [[28, "ipex-llm-optimized-model-list-for-inference"]], "oneDNN NHWC APIs": [[18, "onednn-nhwc-apis"]], "torch.compile (Beta, NEW feature from 2.0.0)": [[7, "torch-compile-beta-new-feature-from-2-0-0"]], "your_conf_file": [[14, "your-conf-file"]]}, "docnames": ["design_doc/cpu/isa_dyndisp", "index", "tutorials/api_doc", "tutorials/blogs_publications", "tutorials/cheat_sheet", "tutorials/contribution", "tutorials/examples", "tutorials/features", "tutorials/features/amp", "tutorials/features/auto_channels_last", "tutorials/features/codeless_optimization", "tutorials/features/fast_bert", "tutorials/features/graph_capture", "tutorials/features/graph_optimization", "tutorials/features/hypertune", "tutorials/features/int8_overview", "tutorials/features/int8_recipe_tuning_api", "tutorials/features/isa_dynamic_dispatch", "tutorials/features/nhwc", "tutorials/features/optimizer_fusion", "tutorials/features/runtime_extension", "tutorials/features/split_sgd", "tutorials/features/sq_recipe_tuning_api", "tutorials/getting_started", "tutorials/installation", "tutorials/introduction", "tutorials/known_issues", "tutorials/license", "tutorials/llm", "tutorials/llm/llm_optimize", "tutorials/performance", "tutorials/performance_tuning/launch_script", "tutorials/performance_tuning/torchserve", "tutorials/performance_tuning/tuning_guide", "tutorials/releases"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["design_doc/cpu/isa_dyndisp.md", "index.rst", "tutorials/api_doc.rst", "tutorials/blogs_publications.md", "tutorials/cheat_sheet.md", 
"tutorials/contribution.md", "tutorials/examples.md", "tutorials/features.rst", "tutorials/features/amp.md", "tutorials/features/auto_channels_last.md", "tutorials/features/codeless_optimization.md", "tutorials/features/fast_bert.md", "tutorials/features/graph_capture.md", "tutorials/features/graph_optimization.md", "tutorials/features/hypertune.md", "tutorials/features/int8_overview.md", "tutorials/features/int8_recipe_tuning_api.md", "tutorials/features/isa_dynamic_dispatch.md", "tutorials/features/nhwc.md", "tutorials/features/optimizer_fusion.md", "tutorials/features/runtime_extension.md", "tutorials/features/split_sgd.rst", "tutorials/features/sq_recipe_tuning_api.md", "tutorials/getting_started.md", "tutorials/installation.md", "tutorials/introduction.rst", "tutorials/known_issues.md", "tutorials/license.md", "tutorials/llm.rst", "tutorials/llm/llm_optimize.md", "tutorials/performance.md", "tutorials/performance_tuning/launch_script.md", "tutorials/performance_tuning/torchserve.md", "tutorials/performance_tuning/tuning_guide.md", "tutorials/releases.md"], "indexentries": {"autotune() (in module intel_extension_for_pytorch.quantization)": [[2, "intel_extension_for_pytorch.quantization.autotune", false]], "convert() (in module intel_extension_for_pytorch.quantization)": [[2, "intel_extension_for_pytorch.quantization.convert", false]], "cpupool (class in intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.CPUPool", false]], "enable_onednn_fusion() (in module intel_extension_for_pytorch)": [[2, "intel_extension_for_pytorch.enable_onednn_fusion", false]], "fast_bert() (in module intel_extension_for_pytorch)": [[2, "intel_extension_for_pytorch.fast_bert", false]], "fast_layer_norm() (in module intel_extension_for_pytorch.llm.functional)": [[2, "intel_extension_for_pytorch.llm.functional.fast_layer_norm", false]], "fastlayernorm (class in intel_extension_for_pytorch.llm.modules)": [[2, 
"intel_extension_for_pytorch.llm.modules.FastLayerNorm", false]], "frozenbatchnorm2d (class in intel_extension_for_pytorch.nn)": [[7, "intel_extension_for_pytorch.nn.FrozenBatchNorm2d", false]], "get_core_list_of_node_id() (in module intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.get_core_list_of_node_id", false]], "get_smooth_quant_qconfig_mapping() (in module intel_extension_for_pytorch.quantization)": [[2, "intel_extension_for_pytorch.quantization.get_smooth_quant_qconfig_mapping", false]], "get_weight_only_quant_qconfig_mapping() (in module intel_extension_for_pytorch.quantization)": [[2, "intel_extension_for_pytorch.quantization.get_weight_only_quant_qconfig_mapping", false]], "indirect_access_kv_cache_attention() (in module intel_extension_for_pytorch.llm.functional)": [[2, "intel_extension_for_pytorch.llm.functional.indirect_access_kv_cache_attention", false]], "indirectaccesskvcacheattention (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.IndirectAccessKVCacheAttention", false]], "intel_extension_for_pytorch": [[2, "module-intel_extension_for_pytorch", false]], "intel_extension_for_pytorch.cpu.runtime": [[2, "module-intel_extension_for_pytorch.cpu.runtime", false]], "intel_extension_for_pytorch.llm": [[2, "module-intel_extension_for_pytorch.llm", false]], "intel_extension_for_pytorch.llm.functional": [[2, "module-intel_extension_for_pytorch.llm.functional", false]], "intel_extension_for_pytorch.llm.modules": [[2, "module-intel_extension_for_pytorch.llm.modules", false]], "intel_extension_for_pytorch.quantization": [[2, "module-intel_extension_for_pytorch.quantization", false]], "interaction() (in module intel_extension_for_pytorch.nn.functional)": [[7, "intel_extension_for_pytorch.nn.functional.interaction", false]], "is_runtime_ext_enabled() (in module intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.is_runtime_ext_enabled", 
false]], "linear2silumul (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.Linear2SiluMul", false]], "linearadd (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearAdd", false]], "linearaddadd (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearAddAdd", false]], "lineargelu (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearGelu", false]], "linearmul (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearMul", false]], "linearnewgelu (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearNewGelu", false]], "linearrelu (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearRelu", false]], "linearsilu (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearSilu", false]], "linearsilumul (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.LinearSiluMul", false]], "mergedembeddingbag (class in intel_extension_for_pytorch.nn.modules)": [[7, "intel_extension_for_pytorch.nn.modules.MergedEmbeddingBag", false]], "mergedembeddingbagwithsgd (class in intel_extension_for_pytorch.nn.modules)": [[7, "intel_extension_for_pytorch.nn.modules.MergedEmbeddingBagWithSGD", false]], "module": [[2, "module-intel_extension_for_pytorch", false], [2, "module-intel_extension_for_pytorch.cpu.runtime", false], [2, "module-intel_extension_for_pytorch.llm", false], [2, "module-intel_extension_for_pytorch.llm.functional", false], [2, "module-intel_extension_for_pytorch.llm.modules", false], [2, "module-intel_extension_for_pytorch.quantization", false]], "multistreammodule (class in intel_extension_for_pytorch.cpu.runtime)": [[2, 
"intel_extension_for_pytorch.cpu.runtime.MultiStreamModule", false]], "multistreammodulehint (class in intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.MultiStreamModuleHint", false]], "optimize() (in module intel_extension_for_pytorch)": [[2, "intel_extension_for_pytorch.optimize", false]], "optimize() (in module intel_extension_for_pytorch.llm)": [[2, "intel_extension_for_pytorch.llm.optimize", false]], "pagedattention (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.PagedAttention", false]], "pin (class in intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.pin", false]], "prepare() (in module intel_extension_for_pytorch.quantization)": [[2, "intel_extension_for_pytorch.quantization.prepare", false]], "rms_norm() (in module intel_extension_for_pytorch.llm.functional)": [[2, "intel_extension_for_pytorch.llm.functional.rms_norm", false]], "rmsnorm (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.RMSNorm", false]], "rotary_embedding() (in module intel_extension_for_pytorch.llm.functional)": [[2, "intel_extension_for_pytorch.llm.functional.rotary_embedding", false]], "rotaryembedding (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.RotaryEmbedding", false]], "task (class in intel_extension_for_pytorch.cpu.runtime)": [[2, "intel_extension_for_pytorch.cpu.runtime.Task", false]], "varlen_attention() (in module intel_extension_for_pytorch.llm.functional)": [[2, "intel_extension_for_pytorch.llm.functional.varlen_attention", false]], "varlenattention (class in intel_extension_for_pytorch.llm.modules)": [[2, "intel_extension_for_pytorch.llm.modules.VarlenAttention", false]], "verbose (class in intel_extension_for_pytorch)": [[2, "intel_extension_for_pytorch.verbose", false]]}, "objects": {"": [[2, 0, 0, "-", "intel_extension_for_pytorch"]], 
"intel_extension_for_pytorch": [[2, 2, 1, "", "enable_onednn_fusion"], [2, 2, 1, "", "fast_bert"], [2, 0, 0, "-", "llm"], [2, 2, 1, "", "optimize"], [2, 0, 0, "-", "quantization"], [2, 1, 1, "", "verbose"]], "intel_extension_for_pytorch.cpu": [[2, 0, 0, "-", "runtime"]], "intel_extension_for_pytorch.cpu.runtime": [[2, 1, 1, "", "CPUPool"], [2, 1, 1, "", "MultiStreamModule"], [2, 1, 1, "", "MultiStreamModuleHint"], [2, 1, 1, "", "Task"], [2, 2, 1, "", "get_core_list_of_node_id"], [2, 2, 1, "", "is_runtime_ext_enabled"], [2, 1, 1, "", "pin"]], "intel_extension_for_pytorch.llm": [[2, 0, 0, "-", "functional"], [2, 0, 0, "-", "modules"], [2, 2, 1, "", "optimize"]], "intel_extension_for_pytorch.llm.functional": [[2, 2, 1, "", "fast_layer_norm"], [2, 2, 1, "", "indirect_access_kv_cache_attention"], [2, 2, 1, "", "rms_norm"], [2, 2, 1, "", "rotary_embedding"], [2, 2, 1, "", "varlen_attention"]], "intel_extension_for_pytorch.llm.modules": [[2, 1, 1, "", "FastLayerNorm"], [2, 1, 1, "", "IndirectAccessKVCacheAttention"], [2, 1, 1, "", "Linear2SiluMul"], [2, 1, 1, "", "LinearAdd"], [2, 1, 1, "", "LinearAddAdd"], [2, 1, 1, "", "LinearGelu"], [2, 1, 1, "", "LinearMul"], [2, 1, 1, "", "LinearNewGelu"], [2, 1, 1, "", "LinearRelu"], [2, 1, 1, "", "LinearSilu"], [2, 1, 1, "", "LinearSiluMul"], [2, 1, 1, "", "PagedAttention"], [2, 1, 1, "", "RMSNorm"], [2, 1, 1, "", "RotaryEmbedding"], [2, 1, 1, "", "VarlenAttention"]], "intel_extension_for_pytorch.nn": [[7, 1, 1, "", "FrozenBatchNorm2d"]], "intel_extension_for_pytorch.nn.functional": [[7, 2, 1, "", "interaction"]], "intel_extension_for_pytorch.nn.modules": [[7, 1, 1, "", "MergedEmbeddingBag"], [7, 1, 1, "", "MergedEmbeddingBagWithSGD"]], "intel_extension_for_pytorch.quantization": [[2, 2, 1, "", "autotune"], [2, 2, 1, "", "convert"], [2, 2, 1, "", "get_smooth_quant_qconfig_mapping"], [2, 2, 1, "", "get_weight_only_quant_qconfig_mapping"], [2, 2, 1, "", "prepare"]]}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", 
"class", "Python class"], "2": ["py", "function", "Python function"]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:function"}, "terms": {"": [2, 3, 5, 8, 10, 14, 15, 18, 19, 20, 21, 22, 26, 31, 32, 33], "0": [1, 2, 4, 5, 8, 10, 11, 13, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 30, 31, 32, 33], "00": [31, 34], "00000": 21, "00000000000602e7": 17, "0000012345": 21, "001": [6, 8], "0016": 30, "01": [2, 4, 7, 16, 31, 32, 34], "02": [30, 32], "02x": 30, "03": 32, "03x": 30, "04": [30, 31], "04x": 30, "05": [2, 7, 10, 30, 31], "05x": 30, "06": [2, 31, 32], "06x": 30, "07": 31, "07x": 30, "08": 31, "08x": 30, "09": [17, 31], "096": 32, "09864": 2, "09x": 30, "0x00007f3cde954000": 6, "0x00007f3ce16ac000": 6, "0x00007f3cf70fc000": 6, "0x00007f3cf985a000": 6, "0x00007f3cf98e0000": 6, "0x1": 17, "0x700001c": 30, "0x7fff": 17, "0xd0002a0": 30, "0xffff": 17, "1": [1, 2, 3, 4, 6, 8, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 28, 29, 30, 31, 33], "10": [7, 14, 16, 17, 18, 21, 25, 26, 31, 32, 33], "100": [2, 4, 14, 16, 17, 30, 32], "10000": 2, "1009": 30, "100mb": 34, "1024": [30, 33], "102b": 28, "1032": 34, "10438": 2, "1053": 34, "1074": 34, "10k": 6, "10x": 30, "11": [17, 31, 32], "111": 33, "112": [26, 30, 33, 34], "117": 31, "118": 31, "11b": [28, 34], "11x": 30, "12": [6, 10, 14, 17, 30, 31, 32], "1200": 30, "12345": 21, "1234500000": 21, "1234512345": 21, "125m": 6, "127": [6, 31, 34], "128": [6, 8, 10, 13, 20, 30, 34], "128k": [2, 28, 34], "128task": 30, "1295": 34, "12b": 28, "12x": 30, "13": [3, 10, 17, 30, 31, 32, 33], "1318": 34, "1322": 34, "1328": 34, "1330": 34, "1338": 34, "1341": 34, "1353": 34, "1355": 34, "1367": 34, "1373": 34, "1376": 34, "1384": 34, "1391": 34, "1392": 34, "13b": [28, 30, 34], "13x": 30, "14": [31, 32, 34], "140": 31, "1414": 34, "1419": 34, "143": 31, "146": 31, "1473": 34, "1488": 34, "149": 31, "14x": 30, "15": [14, 17, 30, 31, 32], "151": 31, "1513": 34, "1517": 34, "154": 31, "1563": 34, "1564": 34, "1566": 
34, "1568": 34, "157": 31, "1580": 34, "1585": 34, "1587": 34, "1589": 34, "159": 31, "1590": 34, "1592": 34, "1593": 34, "1594": 34, "15x": 30, "16": [2, 17, 20, 21, 30, 31, 32], "160": 30, "162": 31, "164": 31, "1664": 34, "167": 31, "1677": 34, "1682": 34, "1688": 34, "1695": 34, "16gb": 30, "16x": 30, "16xlarg": 30, "17": [6, 30, 31, 32], "170": 30, "175": 31, "176": 31, "177": 31, "17th": 30, "18": [30, 31, 32], "18x": 30, "19": [7, 30, 31, 32, 34], "199": 30, "19x": 30, "1_6b": 28, "1b": 34, "1b7": 28, "1d": 18, "1e": [2, 7, 10, 16], "1mb": 33, "2": [1, 2, 3, 8, 10, 16, 17, 18, 20, 21, 25, 26, 27, 28, 29, 30, 31, 33], "20": [2, 7, 18, 30, 31, 32, 34], "2006080250": 30, "200m": 33, "2017": 3, "2019": 3, "2020": 3, "2021": [3, 17, 31, 32], "2022": [3, 31, 32], "2023": [2, 3, 30], "2024": 33, "2048": [2, 6], "205": 34, "20b": 28, "20x": 30, "21": [30, 31, 32], "2104": 2, "2105": 30, "2137": 34, "2195": 34, "2198": 34, "21x": 30, "22": [6, 30, 31, 32], "220m": 34, "220mb": 34, "2211": 2, "2229": 34, "223": 32, "2236": 34, "224": [6, 8, 10, 12, 13, 30, 32, 34], "224m": 34, "2251": 34, "2253": 34, "2257": 34, "2264": 34, "2275": 34, "2278": 34, "2280": 34, "2283": 34, "2290": 34, "2292": 34, "2299": 34, "23": [21, 31, 32], "2315": 34, "2317": 34, "2319": 34, "233": 31, "2334": 34, "2349": 34, "235": 31, "236": 31, "2392": 34, "24": [31, 32], "2412": 34, "2433": 34, "244": 13, "2468": 34, "2469": 34, "2473": 34, "2476": 34, "2480": 34, "2491": 34, "24x": 30, "24xlarg": 32, "25": [31, 32], "2511": 34, "2550": 34, "256": [2, 30], "2561": 34, "2568": 34, "256gb": 30, "2584": 34, "26": [30, 31, 32], "2613": 34, "2617": 34, "2627": 34, "2631": 34, "2641": 34, "2663": 34, "2666": 33, "2675": 34, "26x": 30, "27": [31, 32, 33], "2704": 34, "2733": 34, "274": 32, "2747": 34, "278": 34, "27x": 30, "28": [10, 14, 16, 30, 31, 32, 33, 34], "2883": 34, "29": [7, 31, 32], "2910": 34, "2911": 34, "2928": 34, "29500": [6, 31], "2985": 34, "2987": 34, "29x": 30, "2b": 28, "2d": 18, 
"2nd": 28, "2x": 34, "3": [2, 5, 6, 7, 8, 10, 12, 13, 14, 16, 17, 18, 20, 21, 28, 30, 31, 33], "30": [31, 32], "3030": 34, "305": 30, "3079": 34, "3080": 34, "30b": 28, "30ghz": 30, "30x": 30, "31": [31, 32], "3116": 34, "3143": 34, "3185": 34, "31x": 30, "32": [2, 6, 18, 21, 23, 30, 31, 32], "3200": 30, "3209": 34, "3214": 34, "3218": 34, "3246": 34, "3248": 34, "3291": 34, "32x": 30, "32x16d": 30, "33": [17, 31, 32], "3305": 34, "3307": 34, "3333": 34, "339081764221191": 14, "33x": 30, "34": [31, 32], "35": [31, 32], "355": 31, "356": 31, "35x": 30, "36": [30, 31, 32], "36x": 30, "37": [31, 32, 34], "38": [31, 32], "384": [10, 32, 34], "384task": 30, "38x": 30, "39": [30, 31, 32, 34], "39x": 30, "3b": [28, 34], "3d": 34, "3e": [10, 34], "3rd": [3, 7, 21, 30, 34], "4": [2, 6, 11, 13, 14, 18, 20, 23, 28, 30, 31, 33], "40": [30, 31, 32, 34], "407": 34, "409": 26, "4096": [2, 33], "40b": 28, "40mb": 34, "41": [31, 32], "42": [31, 32], "425": 34, "43": [6, 11, 31, 32], "432": 34, "438": 34, "44": [30, 31, 32], "44x": 30, "45": [6, 11, 31, 32], "452": 34, "45x": 30, "46": [31, 32], "47": [31, 32], "470": 31, "471": 31, "473": 31, "476": 31, "479": 31, "47x": 30, "48": [30, 31, 32], "48x": 30, "49": [30, 31, 32], "49786": 34, "4bit": 34, "4k": 28, "4th": [28, 30], "4x": 3, "5": [2, 6, 10, 13, 14, 16, 17, 18, 19, 20, 21, 22, 26, 28, 30, 31, 32, 33], "50": [18, 31, 32], "50ghz": 33, "51": [31, 32], "512": [1, 6, 11, 16, 25, 28, 31], "513": 31, "52": [31, 32], "524": 34, "53": [31, 32], "531": 34, "54": [31, 32], "55": [31, 32, 33], "551": 34, "55x": 30, "56": [30, 31, 32, 33], "57": 31, "57x": 30, "58": [17, 31], "589": 34, "58x": 30, "59": 31, "591": 31, "5d": 16, "5m": 34, "5mb": 34, "5rc3": 34, "5x": 34, "6": [2, 5, 7, 11, 14, 20, 30, 31, 32, 33, 34], "60": 31, "602": 34, "61": 31, "62": 31, "62x": 30, "63": [31, 34], "64": [2, 8, 10, 16, 20, 30, 31, 34], "642": 34, "647": 34, "648": 34, "64byte": 34, "64gb": 30, "65": 31, "654": 31, "655": 31, "65536": 33, "657": 34, 
"66": [17, 31, 34], "67": [30, 31, 34], "674": 34, "67x": 30, "68": [31, 34], "684": 34, "685": 34, "68m": 34, "69": [30, 31], "692": 34, "6b": [2, 28, 30], "7": [10, 14, 17, 20, 21, 31, 32, 34], "70": 31, "70b": [28, 34], "71": 31, "711": 34, "71x": 30, "72": 31, "73": 31, "74": 31, "75": [30, 31], "75x": 30, "76": [30, 31], "760": [31, 32], "761": [31, 32], "762": 32, "763": 32, "764": 31, "768gb": 30, "77": 31, "77x": 30, "78": [30, 31], "784": 31, "787": 34, "78x": 30, "79": [30, 31], "7b": [6, 28, 30, 34], "7f": 16, "7m": 34, "7x": 34, "8": [14, 16, 30, 31, 32, 33], "80": [5, 30, 31], "81": [30, 31], "8180": 32, "8180m": [14, 33], "81x": 30, "82": 31, "822": 34, "83": [31, 33], "8375c": 32, "8380": 30, "8380h": 30, "83x": 30, "84": [6, 30, 31, 33], "85": [30, 31], "85x": 30, "86": [30, 31], "87": 31, "88": 31, "8b": 28, "8x": 18, "8x7b": 28, "9": [6, 7, 14, 17, 23, 25, 31, 32], "9000": 32, "9000000000": [31, 33], "9001": 32, "9002": 32, "9003": 32, "90b": 34, "90ghz": 30, "92": 30, "93": 30, "96": 30, "96x": 30, "97": 30, "975": 32, "98": 30, "981": 32, "982": 32, "99": [16, 30, 34], "992": 34, "A": [2, 5, 6, 7, 10, 11, 17, 26, 28, 31, 33, 34], "And": [15, 20, 32, 34], "As": [10, 19, 20, 28, 31, 32, 33, 34], "At": [7, 17], "But": [17, 18], "By": [17, 31, 33], "For": [1, 2, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19, 20, 21, 23, 24, 25, 26, 28, 29, 31, 32, 33, 34], "If": [2, 5, 6, 7, 8, 9, 10, 13, 14, 15, 16, 17, 20, 26, 31, 32, 33, 34], "In": [1, 2, 6, 7, 8, 12, 16, 17, 18, 19, 21, 23, 28, 31, 32, 33, 34], "It": [2, 6, 7, 8, 10, 13, 17, 18, 20, 21, 23, 26, 29, 31, 33, 34], "Its": 28, "NOT": [18, 31], "No": [2, 18, 34], "Not": 2, "ON": 30, "On": [1, 2, 7, 18, 28, 33], "One": [2, 3, 18, 19, 31, 33], "Such": 17, "The": [0, 1, 2, 5, 6, 7, 8, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 28, 29, 30, 31, 32, 33, 34], "Then": 32, "There": [14, 16, 20, 33, 34], "These": [1, 5, 6, 7, 8, 13, 28, 34], "To": [2, 5, 6, 7, 10, 13, 15, 16, 17, 18, 20, 21, 23, 28, 
32, 33, 34], "Will": [6, 18], "With": [1, 2, 7, 10, 20, 31, 34], "_": [13, 15, 16, 17, 18, 20, 30, 31, 32, 33, 34], "___": 13, "_____": 13, "__init__": [5, 6, 8, 10, 16, 20, 26, 34], "__m256i": 17, "__m512": 17, "__m512i": 17, "__main__": [26, 31, 32, 34], "__name__": [26, 34], "_appli": 18, "_build": 5, "_c": [17, 26], "_cmp_ord_q": 17, "_core": 31, "_cvt_fp32_to_bf16": 17, "_get_current_isa_level": 17, "_get_highest_binary_support_isa_level": 17, "_get_highest_cpu_support_isa_level": 17, "_jit_set_texpr_fuser_en": 26, "_lu_with_info": 8, "_mm256_mask_storeu_epi16": 17, "_mm256_storeu_si256": 17, "_mm512_add_epi32": 17, "_mm512_and_si512": 17, "_mm512_castps_si512": 17, "_mm512_cmp_ps_mask": 17, "_mm512_cvtneps_pbh": 17, "_mm512_cvtusepi32_epi16": 17, "_mm512_loadu_p": 17, "_mm512_mask_blend_epi32": 17, "_mm512_maskz_loadu_p": 17, "_mm512_set1_epi32": 17, "_mm512_srli_epi32": 17, "_native_multi_head_attent": 8, "_reorder_cach": 2, "_sym": 2, "_timestamp_inst": 31, "_timestamp_instance_": 31, "ab": [13, 32], "abi": [6, 17, 34], "abil": 16, "abl": 15, "abnorm": [26, 34], "about": [1, 2, 5, 7, 13, 16, 32, 33, 34], "abov": [2, 5, 10, 19, 28, 30, 31, 32], "absolut": [2, 31], "abstract": [2, 11, 20], "acceler": [1, 2, 3, 6, 7, 13, 28, 29, 30, 34], "accept": [2, 34], "access": [2, 6, 7, 18, 19, 32, 34], "accommod": 18, "accompani": 34, "accord": [2, 13, 28, 33, 34], "accordingli": 16, "account": 6, "accu": 16, "accumul": 2, "accur": 8, "accuraci": [2, 3, 6, 7, 8, 15, 16, 21, 22, 26, 28, 34], "accuracy_criterion": [2, 4, 16, 34], "accuracy_criterion_typ": 2, "accuracy_criterion_valu": 2, "achang": 15, "achiev": [1, 2, 6, 7, 28, 33, 34], "across": [16, 34], "act": 34, "act_ic_observ": 2, "act_observ": 2, "act_quant_mod": 2, "action": [6, 23], "activ": [2, 6, 7, 15, 16, 20, 28, 31, 33, 34], "actual": [18, 21], "acycl": 13, "ad": [2, 7, 10, 33, 34], "adagrad": [19, 21], "adagrad_fused_step": 19, "adagrad_step": 19, "adam": 34, "adapt": 7, "adaptive_avg_pool3d": 8, 
"adaptive_max_pool3d": 8, "adaptiveaveragepoolingkrnl": 17, "add": [2, 5, 7, 8, 13, 14, 19, 21, 32, 34], "add_": 19, "add_argu": [6, 23], "add_casual_mask": 2, "add_execut": 6, "add_help": [6, 23], "addbmm": 8, "addcdiv_": 19, "addcmul_": 19, "addit": [2, 6, 7, 17, 21, 28, 34], "addition": 32, "addlayernorm": 34, "addmm": 8, "addmm_": 8, "addr": 31, "address": [7, 18, 31, 32, 33, 34], "addtion": 17, "adjust": 16, "adopt": [28, 34], "advanc": [1, 2, 6, 7, 16, 25, 28], "advantag": [1, 2, 7, 9, 12, 18, 21, 25, 30, 31, 33], "aes_ni": 17, "affect": [2, 31], "affin": [7, 10, 15, 20, 31, 32, 33], "affinit": 32, "after": [2, 5, 7, 13, 20, 21, 23, 24, 32, 33, 34], "afterward": [31, 33], "ag": 7, "again": [5, 19, 32], "against": 6, "agre": 5, "ahead": 5, "ai": [1, 2, 3, 7, 28], "aim": [7, 10, 16, 33], "aka": [7, 18], "albert": 34, "algorithm": [2, 13, 18, 30, 34], "alia": 2, "alibi": 2, "alibi_slop": 2, "align": [17, 18, 21, 34], "aliv": 32, "all": [2, 5, 6, 8, 13, 14, 17, 19, 20, 28, 29, 32, 33, 34], "all_logical_cor": 14, "all_physical_cor": 14, "allcat": 2, "allenai": 26, "alloc": [2, 10, 20, 28, 30, 32, 34], "allow": [2, 8, 14, 16, 22, 33, 34], "allreduc": 2, "almost": 18, "along": [2, 5, 6, 21, 33, 34], "alpha": [2, 6, 19, 22], "alpha_max": [16, 22], "alpha_min": [16, 22], "alpha_step": [16, 22], "alphafold2": 34, "alreadi": [1, 5, 6, 18, 28, 33], "also": [1, 2, 6, 7, 10, 13, 14, 16, 18, 19, 28, 30, 31, 33, 34], "altern": [2, 6, 18], "although": [2, 33], "alwai": [5, 6, 7, 8, 18, 31, 33, 34], "amazon": 32, "among": [2, 31, 32, 33], "amount": [2, 16, 26, 28, 33], "amp": [4, 6, 10, 23, 26, 34], "amp_dtyp": [6, 23], "amp_en": [6, 23], "ampconf": 34, "amplifi": 1, "amx": [1, 3, 6, 7, 17, 25, 28, 30], "amx_bf16": 17, "amx_int8": 17, "amx_til": 17, "an": [1, 2, 5, 6, 7, 8, 10, 11, 13, 14, 16, 17, 18, 19, 20, 21, 26, 31, 32, 33, 34], "anaconda": 17, "analysi": 33, "ani": [2, 5, 8, 10, 17, 18, 32, 34], "announc": 34, "anonym": 17, "anoth": [14, 31, 33, 34], "answer": [18, 30], 
"anymor": [7, 34], "anyplac": 4, "ao": [2, 6, 15], "apach": [27, 32], "api": [1, 3, 6, 10, 11, 15, 20, 26, 33, 34], "app": [6, 34], "append": [6, 7], "append_torchlib_if_found": 6, "appli": [2, 6, 7, 8, 12, 13, 16, 18, 19, 21, 23, 26, 28, 29, 31, 34], "applic": [1, 2, 7, 20, 28, 32, 33], "apply_funct": 2, "appropri": 33, "apr": 3, "ar": [1, 2, 3, 5, 6, 7, 8, 10, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 28, 29, 30, 31, 32, 33, 34], "arang": [2, 6, 16], "arbitrari": 2, "arc": 3, "architectur": [2, 28, 30, 33], "area": [7, 14], "aren": 5, "arg": [2, 4, 6, 7, 14, 16, 19, 23, 31, 32, 34], "argc": 6, "argmax": 16, "argpars": [6, 23], "argument": [2, 6, 7, 22, 26, 31], "argumentpars": [6, 23], "argv": 6, "around": 31, "arrai": 18, "articl": [30, 33], "arxiv": 2, "ask": 5, "assign": [18, 31, 32, 33], "assum": [2, 7, 8, 23, 32, 33, 34], "asu": 33, "asymmetr": 2, "async": [20, 34], "asynchron": [2, 7], "aten": [2, 6, 7, 34], "aten_cpu_cap": 17, "attach": 33, "attent": [1, 2, 7, 28, 34], "attention_mask": [2, 6], "attention_mask_pad": 6, "attn_implement": [6, 11], "attn_output": 2, "attn_weight": 2, "attribut": 18, "aug": [3, 30], "auto": [2, 6, 10, 17, 18, 22, 23, 26, 28, 31, 33, 34], "auto_alpha_arg": 16, "auto_ipex": 34, "auto_kernel_select": [2, 7, 30], "autocast": [4, 6, 7, 10, 23, 34], "autoclass": 5, "autoconfig": [6, 23], "autofunct": 5, "autom": [4, 7, 8, 14, 31, 32, 34], "automat": [1, 2, 6, 7, 9, 10, 12, 13, 15, 16, 18, 22, 28, 31, 32, 33, 34], "automaticlli": 2, "automixprecis": 34, "automodelforcausallm": [6, 23, 29, 34], "autotoken": [6, 23], "autotp": 28, "autotun": [2, 4, 22, 34], "avaiabl": 2, "avail": [1, 2, 6, 7, 11, 17, 20, 22, 23, 29, 31, 33, 34], "avg_pool3d": 8, "avoid": [2, 10, 20, 21, 26, 31, 32, 33, 34], "avx": [1, 6, 17, 25, 28], "avx2": [17, 26, 34], "avx256": 17, "avx2_vnni": 17, "avx512": [7, 17, 18, 32, 34], "avx512_4fmap": 17, "avx512_4vnniw": 17, "avx512_bf16": 17, "avx512_bitalg": 17, "avx512_bw": 17, "avx512_cd": 17, 
"avx512_core_vnni": 34, "avx512_dq": 17, "avx512_er": 17, "avx512_f": 17, "avx512_fp16": 17, "avx512_ifma": 17, "avx512_pf": 17, "avx512_vbmi": 17, "avx512_vbmi2": 17, "avx512_vl": 17, "avx512_vnni": 17, "avx512_vp2intersect": 17, "avx512_vpclmul": 17, "avx512_vpopcntdq": 17, "avx_vnni": 17, "awar": [18, 20, 31, 32], "awq": [2, 34], "b": [7, 8, 16, 28], "back": [6, 12, 17, 18, 21, 26], "backbon": 2, "backend": [1, 2, 3, 6, 7, 12, 13, 16, 17, 23, 26, 28, 31, 33, 34], "background": 33, "background_thread": [31, 33], "backpropag": 16, "backward": [6, 7, 8, 16, 21, 33, 34], "bactchnorm": 34, "baddbmm": 8, "bag": [26, 34], "baichuan": [2, 28, 34], "baichuan2": [28, 34], "bake": 34, "balanc": [7, 16, 22, 33], "bandwidth": [28, 34], "base": [1, 2, 3, 4, 5, 6, 7, 10, 11, 17, 20, 21, 26, 28, 29, 30, 32, 33, 34], "base_dir": 29, "base_text_classif": 30, "baselin": [16, 22, 34], "basic": [2, 4, 16, 21, 33, 34], "batch": [2, 6, 7, 13, 16, 18, 20, 23, 26, 30, 32, 34], "batch_decod": [6, 23], "batch_id": 6, "batch_idx": [6, 13], "batch_siz": [2, 6, 11, 13, 16, 18, 23, 32], "batchnorm": [13, 17, 18, 26, 34], "batchnorm2d": [7, 10, 26, 34], "batchsiz": [2, 20], "beam": [2, 28], "beam_idx": 2, "beam_idx_tmp": 6, "beam_width": 28, "becam": 34, "becaus": [8, 17, 18, 21, 28, 33, 34], "becom": [7, 28, 33], "been": [0, 1, 6, 7, 10, 17, 18, 28, 31, 33, 34], "beeter": 28, "befor": [1, 2, 5, 6, 13, 14, 17, 18, 20, 31, 33, 34], "begin": 5, "beginn": 16, "behavior": [2, 20, 31, 33], "behaviour": 10, "being": [7, 33], "believ": [8, 18], "below": [6, 8, 10, 14, 19, 20, 21, 22, 23, 26, 28, 31, 32, 33, 34], "bench": 32, "benchmark": [6, 26, 30, 31, 34], "benefici": 18, "benefit": [6, 7, 8, 10, 20, 21, 28, 32, 33, 34], "benifit": 2, "bert": [3, 4, 10, 30, 34], "bert_int8_jit": 32, "bert_ipex_int8": 32, "bertmodel": [4, 6, 11, 32], "bertmodelmodel": 4, "besid": [28, 33, 34], "best": [2, 6, 7, 8, 14, 16, 17, 22, 24, 28, 33, 34], "beta": [23, 26], "better": [1, 2, 6, 7, 15, 18, 20, 28, 31, 32, 33, 
34], "between": [7, 8, 17, 20, 33, 34], "beyond": 7, "bf16": [2, 3, 7, 17, 19, 21, 23, 26, 28, 30, 34], "bf16_gw": 21, "bf16_w": 21, "bfloat16": [2, 3, 4, 7, 10, 11, 17, 18, 23, 29, 31, 34], "bfp16": 34, "bia": [2, 8, 20, 34], "big": [7, 18], "bigcod": 28, "bigscienc": 28, "bin": [5, 6, 17, 31, 32], "binari": [5, 6, 7, 8, 17, 34], "binary_cross_entropi": 8, "binary_cross_entropy_with_logit": 8, "bind": [6, 7, 31, 32, 33, 34], "bio": 30, "bit": [21, 28], "blob": 2, "block": [2, 5, 16, 20, 22, 28, 33, 34], "block_numb": 2, "block_siz": 2, "block_tabl": 2, "blocktim": 31, "blockwis": 16, "blog": [2, 34], "bloom": [2, 28], "bmm": [8, 34], "bmp": 18, "bn": [2, 10, 15, 26, 34], "bn_fold": 2, "bodi": 17, "bool": [2, 14], "boolean": [7, 34], "booltensor": 7, "boost": [3, 6, 7, 9, 21, 30, 31, 33, 34], "both": [1, 2, 6, 7, 16, 18, 19, 21, 28, 29, 31, 32, 33, 34], "bother": 16, "bottl": 19, "bottleneck": [2, 28], "bottom": 21, "bound": [19, 20, 28, 33], "box": [6, 10, 33], "branch": [1, 7, 30], "break": [6, 16, 34], "brew": 5, "brief": [18, 28, 34], "briefli": 33, "bring": [2, 6, 7, 9, 15, 16, 21, 28, 31, 33, 34], "broad": [7, 9, 34], "broader": 34, "brought": [33, 34], "buffer": [2, 28], "bug": [1, 5, 34], "bui": 21, "build": [6, 28, 33, 34], "built": [7, 17, 20, 34], "busi": 33, "c": [1, 7, 8, 16, 17, 20, 26, 28, 31, 32, 33, 34], "c1": 20, "c10": [6, 17], "c620": 33, "cach": [2, 5, 7, 19, 20, 30, 34], "cache_weight_for_large_batch": 2, "caff": 3, "calcul": [1, 2, 8, 16, 21, 22], "cali_dataset": 34, "calib_dataload": [2, 6, 16, 34], "calib_dataset": [6, 29], "calib_evalu": 6, "calib_func": 2, "calib_sampl": 29, "calibr": [2, 13, 22, 26, 29, 30, 32, 34], "calibrated_model": 34, "calibration_data_load": [4, 6, 13], "calibration_data_set": [15, 34], "calibration_model": 29, "calibration_sampl": 6, "call": [2, 6, 8, 13, 17, 18, 21, 26, 32, 33, 34], "caller": [26, 34], "can": [1, 2, 5, 6, 7, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 26, 28, 29, 30, 31, 32, 33, 34], 
"cannot": [8, 19, 26, 31, 34], "canon": 18, "capabl": [3, 17, 34], "capac": [21, 30], "captur": [4, 34], "card": 18, "care": 32, "carri": 30, "case": [2, 6, 7, 9, 12, 16, 17, 18, 28, 31, 33, 34], "cases": 32, "cast": [2, 8, 21, 28], "casual": 26, "cat": [8, 31, 32, 34], "catch": 6, "categor": 7, "categori": [8, 34], "caus": [2, 7, 21, 26, 28, 31, 33, 34], "causal": 2, "cc": [5, 6, 17], "ccl": [6, 31, 34], "cd": [5, 6], "cdist": 8, "center": 34, "cento": 30, "cerr": 6, "certain": [1, 7, 26, 28, 29, 31, 33], "ch_axi": 2, "chain": 21, "chang": [2, 5, 6, 7, 8, 10, 11, 12, 15, 17, 18, 20, 23, 25, 26, 29, 31], "changed_onli": 5, "changelog": 34, "channel": [2, 3, 10, 15, 16, 26, 34], "channels_last": [6, 7, 18, 23, 33, 34], "char": 6, "charact": 5, "chat": [28, 34], "chatglm": [2, 28], "chatglm2": [28, 34], "chatglm3": [28, 34], "cheat": 23, "check": [2, 5, 6, 7, 13, 18, 28, 29, 31, 34], "check_trac": [6, 13, 32], "checkpoint": [2, 6, 29], "checkpoints_json": 29, "chip": 33, "chipset": 33, "choic": [6, 21, 23, 31, 34], "choleski": 8, "cholesky_invers": 8, "cholesky_solv": 8, "choos": [6, 8, 20, 23, 31, 33, 34], "chosen": [8, 14, 17], "chunk": 34, "chw": 18, "chwn": 18, "ci": 5, "cifar10": [6, 13], "circumst": 8, "clamp": 13, "clang": 5, "class": [2, 5, 6, 7, 8, 10, 16, 20, 26, 34], "classif": [26, 30], "claus": [7, 10, 19], "clean": 5, "clear": 10, "clibrat": 34, "click": 3, "clone": 5, "close": [18, 31, 33], "cloud": 3, "clr": 19, "cmake": [5, 6, 17, 34], "cmake_minimum_requir": 6, "cmakefil": 17, "cmakelint": 5, "cmakelist": 6, "cnn": [7, 18, 26, 30, 33, 34], "co": [2, 34], "coco": 30, "code": [1, 2, 5, 6, 7, 10, 11, 12, 13, 18, 19, 21, 23, 24, 26, 27, 29, 33, 34], "codegen": [2, 28, 34], "codeless": 31, "codellama": 28, "codenam": 34, "collabor": 3, "collate_batch": 6, "collate_fn": 6, "collect": [6, 32, 33, 34], "column": 6, "com": [2, 5, 34], "combin": [2, 12, 14, 28, 31, 34], "come": 33, "comma": 33, "command": [4, 5, 6, 14, 23, 31, 32, 33, 34], "comment": [5, 14, 
17, 22, 34], "commit": 5, "common": [17, 21, 28, 31, 33], "commonli": [7, 28, 33, 34], "commun": [6, 28, 31, 32, 33, 34], "communication_backend_nam": 29, "compact": [31, 32, 33], "compar": [1, 2, 7, 13, 18, 21, 26, 28, 30, 31, 33, 34], "compat": [17, 21], "compet": 33, "competit": 33, "compil": [1, 5, 6, 23, 26, 33, 34], "complet": [5, 6, 14, 18, 22, 29, 33], "complex": 17, "complexdoubl": 17, "complexfloat": 17, "complic": [26, 31, 33], "complier": 17, "compon": [15, 26, 27, 28], "compos": [6, 13], "comprehens": [1, 34], "compressor": [3, 7, 16, 22, 34], "compris": 18, "compuat": 13, "comput": [2, 6, 7, 13, 15, 16, 18, 20, 21, 28, 30, 31, 32, 33, 34], "concat": [2, 20, 26, 28, 34], "concat_fp32_from_bf16": 21, "concat_linear": 2, "concat_output": 2, "concaten": [2, 21], "concept": [18, 33], "concern": 7, "conclud": [30, 34], "conclus": 18, "concurr": [32, 33], "conda": [5, 33], "conda_prefix": [31, 32], "condit": 27, "conduct": 7, "conf": [4, 13, 14, 31, 34], "conf_fil": [14, 34], "confer": 3, "config": [2, 6, 11, 23, 31, 32], "configur": [2, 4, 6, 7, 14, 15, 16, 17, 31, 32, 34], "confirm": 31, "conflict": [7, 17], "connect": 33, "consecut": 33, "consider": 16, "consist": [16, 28, 33, 34], "const": [6, 17], "constant": 13, "constraint": [2, 34], "construct": [2, 7, 13], "consum": [7, 14], "consumpt": 34, "contain": [2, 5, 6, 13, 17, 26, 31, 32, 33, 34], "containeraliasingtest": 5, "content": [29, 34], "context": [2, 5, 6, 8, 20, 28, 33, 34], "context_len": 2, "contigu": [6, 13, 18, 33, 34], "contiguous_format": [18, 33], "continu": [31, 32, 34], "contribut": [28, 31, 34], "control": [1, 2, 7, 20, 26, 31, 33, 34], "conv": [2, 8, 10, 13, 15, 20, 26, 34], "conv1d": [8, 13], "conv2": 20, "conv2d": [2, 7, 8, 10, 13, 18, 20, 26, 34], "conv3d": [8, 13, 34], "conv_bn": 2, "conv_bn_fold": [2, 26, 34], "conv_tbc": 8, "conv_transpose1d": 8, "conv_transpose2d": 8, "conv_transpose3d": 8, "conveni": [8, 34], "convers": [2, 8, 13, 34], "convert": [1, 2, 4, 6, 7, 8, 9, 10, 13, 
16, 17, 18, 20, 23, 26, 32, 34], "convert_model": [4, 13, 15, 16], "converted_model": [4, 6, 26, 34], "convolut": [2, 6, 7, 13, 20, 33, 34], "convolution1d": 34, "convolutuon": 2, "convrelu": 13, "convsumrelu": 13, "convtranspose2d": [2, 13], "convtranspose3d": 13, "coo": 18, "cooper": [7, 30, 34], "copi": [5, 17, 18], "copyright": [17, 27], "core": [2, 7, 14, 17, 30, 33, 34], "core_id": [2, 20, 31], "correct": [7, 18, 25, 34], "correspond": [20, 31, 34], "cosine_embedding_loss": 8, "cost": [2, 6, 28, 30, 33], "costli": 33, "could": [7, 13, 16, 18, 26, 32, 33, 34], "count": 31, "counterpart": [2, 7, 18, 34], "coupl": [20, 33, 34], "cout": 6, "cover": [13, 18, 31], "cpp": [5, 6, 33], "cppsdk": 34, "cpu": [1, 3, 4, 5, 6, 7, 8, 10, 14, 15, 16, 19, 20, 23, 25, 26, 28, 30, 31, 32, 34], "cpu_capability_avx512": 17, "cpu_capability_avx512_bf16": 17, "cpu_featur": 17, "cpu_feature_main": 17, "cpu_launcher_arg": 32, "cpu_launcher_en": 32, "cpu_pool": [2, 20, 34], "cpu_pool1": 20, "cpu_pool2": 20, "cpuid": 17, "cpuinfo": 17, "cpunodebind": 33, "cpupool": [2, 20, 34], "crash": [31, 33, 34], "creat": [7, 16, 20, 33, 34], "creation": 2, "creator": 34, "credit": 17, "criteria": 16, "criterion": [6, 8, 16, 22], "cross": [32, 33, 34], "cross_entropy_loss": 8, "crossentropyloss": [6, 16], "csrc": 26, "csv": 14, "ctc_loss": 8, "cu": 5, "cu_seqlens_kv": 2, "cu_seqlens_q": 2, "cudnn": 18, "current": [1, 2, 5, 7, 11, 13, 14, 15, 16, 17, 19, 20, 26, 28, 29, 34], "current_posit": 2, "custom": [1, 2, 7, 26, 34], "customized_forward": 10, "cv": 34, "cvt_fp32_to_bf16": 17, "cvt_fp32_to_bf16_kernel_fn": 17, "cvt_fp32_to_bf16_kernel_impl": 17, "cvt_fp32_to_bf16_kernel_stub": 17, "cvtfp32tobf16": 17, "cvtfp32tobf16krnl": 17, "cxx": [6, 17], "cxx11": 34, "cxx_standard": 6, "d": [4, 5, 6, 7, 8, 13, 26, 28, 34], "d8": 33, "d__avx512f__": 17, "d__avx__": 17, "dag": 13, "daili": 34, "data": [2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 23, 26, 31, 32, 34], "data_typ": 18, "databrick": 
28, "dataload": [2, 6, 10, 13, 16, 20, 22, 29, 34], "dataset": [6, 13, 16, 29, 30, 33, 34], "dataset_nam": [10, 34], "datatyp": [20, 34], "date": 34, "dcmake_prefix_path": 6, "dcpmm": 30, "dcpu_cap": 17, "dcpu_capability_amx": 17, "dcpu_capability_avx2": 17, "dcpu_capability_avx512": 17, "dcpu_capability_avx512_bf16": 17, "dcpu_capability_avx512_fp16": 17, "dcpu_capability_avx512_vnni": 17, "dcpu_capability_default": 17, "ddp": [2, 6], "ddr": 30, "ddr4": 33, "dealloc": 33, "debug": [2, 31], "debug_squad": [10, 34], "dec": 3, "decai": 7, "decid": [2, 15, 20, 28], "decim": 21, "declar": 17, "decltyp": 17, "decod": [2, 28, 30, 34], "deconv3d": 34, "decor": 2, "dedic": [2, 6, 28, 34], "deduct": 31, "deep": [3, 7, 8, 11, 13, 14, 21, 33], "deepcopi": 2, "deepspe": [2, 34], "def": [2, 6, 8, 10, 16, 20, 26, 34], "default": [2, 4, 6, 7, 10, 12, 13, 15, 16, 17, 20, 22, 23, 26, 28, 30, 32, 33, 34], "default_dynamic_qconfig": [15, 32], "default_dynamic_qconfig_map": 6, "default_dynamic_qconfigprepared_model": 4, "default_static_qconfig": [13, 15, 32, 34], "default_static_qconfig_map": 6, "default_static_qconfigprepared_model": 4, "defin": [2, 5, 6, 7, 8, 10, 16, 17, 18, 22, 32], "definit": [17, 21, 34], "degre": 34, "deinit": 5, "deliv": [7, 28, 34], "demand": [2, 7], "demonstr": [6, 18, 26, 32], "demostr": 23, "denomin": 2, "denot": 21, "dens": [7, 18], "dep": 34, "depend": [5, 7, 17, 18, 25, 26, 33, 34], "deploi": 34, "deploy": [2, 7, 13, 34], "deployment_mod": [2, 6, 23], "deprec": [3, 26], "dequant": [13, 16], "desc": 18, "describ": [8, 13, 18, 21, 32, 33], "descript": [4, 7, 16, 18, 20, 25, 33, 34], "descriptor": 34, "design": [2, 5, 8, 18, 21, 29, 34], "desir": [16, 31], "destroy_process_group": 6, "destruct": 33, "detail": [2, 5, 6, 7, 8, 9, 11, 13, 17, 18, 24, 25, 26, 28, 30, 32, 33, 34], "detect": [1, 6, 12, 17, 26, 33, 34], "detectron2": 18, "determin": [2, 6, 17, 21, 33], "develop": [1, 3, 6, 28, 30, 33, 34], "devic": [1, 2, 15, 29, 31, 34], "device_nam": [7, 8], 
"diagram": [18, 33], "dict": [2, 6, 23], "dictionari": 34, "did": [33, 34], "didn": 20, "differ": [1, 2, 7, 15, 16, 17, 18, 20, 28, 31, 32, 33, 34], "difficult": 18, "difficulti": 16, "diffus": [3, 34], "digit": 21, "dim": [2, 6, 18, 23], "dimens": [2, 18, 26], "dimm": 34, "dinner": [6, 23], "dir": [17, 31], "direct": [2, 5, 13], "directli": [2, 6, 33, 34], "directori": [1, 5, 6, 14, 29, 31, 32], "dirty_decay_m": [31, 33], "disabl": [2, 6, 7, 13, 26, 31, 33, 34], "disable_auto_channels_last": 9, "disable_iomp": [14, 32], "disable_numactl": [14, 32], "disadvantag": 21, "discret": 1, "discrete gpu": 1, "discuss": [5, 18, 33], "dispatch": [1, 34], "dist": 6, "dist_sampl": 6, "distilbert": 30, "distribut": [2, 3, 7, 16, 31, 32, 33, 34], "distributeddataparallel": [6, 34], "distributedsampl": 6, "div": 13, "divid": [2, 13, 31, 32, 33, 34], "divis": [2, 20], "divisor": [2, 20], "dl": [3, 7, 34], "dlopen": 20, "dlrm": [3, 7, 26, 30, 34], "dnnl": 30, "dnnl_verbos": 2, "do": [2, 5, 8, 16, 18, 20, 21, 26, 28, 30, 31, 32, 33, 34], "do_ev": [10, 34], "do_sampl": [6, 23], "doc": [1, 2, 5, 11, 29, 34], "doc_strid": [10, 34], "docker": [30, 34], "dockerfil": 34, "dockerhub": 34, "docstr": 5, "document": [0, 7, 17, 20, 29, 34], "doe": [2, 7, 13, 18, 20, 26, 34], "doesn": [2, 15, 16, 18, 26, 34], "dolli": [28, 34], "domin": [1, 7, 28], "don": [2, 5, 8, 14, 17, 34], "done": [6, 10, 16, 17, 26, 33, 34], "dot": [2, 7, 18, 28], "doubl": 17, "down": [5, 32, 34], "download": [6, 13, 16], "downstream": 8, "dpc": 1, "dpcpp": 34, "dram": 2, "dramat": [32, 33], "drawback": [2, 21], "drive": [1, 7, 28], "driven": 2, "drop": [31, 32], "dropout": [2, 10], "dst": 17, "dtype": [2, 4, 6, 7, 8, 10, 11, 13, 15, 16, 17, 23, 26, 29, 31, 34], "due": [1, 8, 10, 17, 20, 26], "dummi": 32, "dummy_tensor": 32, "dummymodul": 10, "dump": [2, 31], "durat": [2, 21], "dure": [4, 6, 7, 10, 13, 16, 21, 31, 33, 34], "dynam": [1, 4, 20, 28, 32, 33, 34], "dynamic_qconfig": 15, "dynamic_quantized_model": 6, "e": [1, 
2, 6, 7, 8, 12, 16, 17, 18, 28, 31, 33, 34], "each": [2, 8, 14, 16, 17, 19, 20, 21, 31, 32, 33, 34], "eager": [1, 7, 11, 12, 23, 32, 34], "earli": [2, 34], "earlier": 21, "eas": [7, 18, 34], "easi": [1, 3, 21], "easier": [2, 18, 21], "easili": [10, 15], "ec2": 32, "edit": [5, 26, 34], "effect": [2, 17, 21, 26, 32, 33], "effici": [1, 7, 11, 19, 20, 28, 31, 33, 34], "effort": 34, "eig": 8, "einsum": 34, "either": [2, 26, 31], "el8_4": 30, "elaps": 33, "element": [2, 18, 19], "eleutherai": [2, 28], "elif": 6, "elimin": 28, "els": [6, 14, 17, 18, 23], "elser": 34, "eltwis": 34, "elu": 13, "emb": 7, "emb1": 7, "emb2": 7, "emb3": 7, "emb_m": 7, "embed": [2, 7, 28, 34], "embedding_bag": 10, "embedding_spec": 7, "embeddingbad": 34, "embeddingbag": [7, 26, 34], "embeddingspec": 7, "embedingbag": 7, "emblist": 7, "emerg": [1, 7, 28], "emphas": 33, "emply_lik": 2, "empow": 3, "empti": [18, 31], "enabl": [1, 2, 3, 4, 6, 7, 8, 10, 13, 16, 18, 20, 22, 23, 26, 28, 31, 32, 33, 34], "enable_auto_channels_last": 9, "enable_auto_mix_precis": 34, "enable_auto_mixed_precis": 34, "enable_auto_optim": 34, "enable_blockwise_loss": [16, 22], "enable_jemalloc": 32, "enable_onednn_fus": [2, 13], "enable_tcmalloc": 32, "encod": 34, "encount": [26, 34], "encourag": 34, "end": [6, 13, 20, 34], "endif": 17, "endl": 6, "engin": [1, 6, 18, 33], "enhanc": [1, 3, 28, 34], "enough": [2, 7, 19], "ensur": [11, 19, 20, 32, 34], "entir": [2, 16, 28], "enumer": [6, 13, 16, 29], "env": [6, 29], "env_key1": 5, "env_key2": 5, "env_val1": 5, "env_val2": 5, "environ": [2, 5, 6, 17, 20, 24, 28, 30, 31, 32, 33], "ep": [2, 7, 10, 19], "epoch": 16, "equal": [2, 15, 20, 31, 32, 33], "equip": 33, "equival": 34, "error": [2, 5, 6, 7, 10, 16, 18, 21, 22, 26, 34], "especi": [2, 5, 28, 34], "etc": [2, 5, 6, 17, 34], "eval": [2, 4, 6, 8, 10, 11, 12, 13, 15, 16, 20, 23, 26, 29, 32, 34], "eval_func": [2, 16, 34], "eval_funct": 4, "evalu": [2, 16, 34], "even": [2, 5, 7, 33, 34], "evenli": 31, "everi": [2, 28], "exact": 2, 
"exactli": 21, "exampl": [2, 5, 7, 8, 13, 18, 19, 21, 22, 23, 24, 25, 28, 29, 32, 33, 34], "example_input": [2, 4, 6, 13, 15, 29, 32, 34], "example_kwarg_input": 2, "examplenet": 20, "examplenet1": 20, "examplenet2": 20, "exce": [26, 30, 33, 34], "except": [28, 31, 34], "excess": 34, "excit": 34, "exclus": 31, "execut": [2, 4, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 19, 20, 26, 31, 32, 33, 34], "exetens": 2, "exhibit": 30, "exist": [1, 5, 7, 13, 26, 31, 33], "exit": [6, 31], "exp": 13, "expect": [2, 7, 30, 34], "expecttest": 5, "expens": 18, "experi": [5, 7, 10, 12, 16, 18, 26, 33, 34], "experiment": 34, "explain": [17, 18, 21], "explicit": [18, 20, 33], "explicitli": [2, 8, 16, 20, 26, 31, 34], "explor": 2, "expon": 21, "export": [4, 31, 33], "expos": 8, "express": [18, 34], "ext": [6, 34], "extend": [1, 18, 25, 33, 34], "extens": [2, 3, 4, 6, 9, 10, 13, 14, 16, 17, 23, 24, 25, 27, 28, 29, 30, 31, 33, 34], "extra": [2, 5, 10, 20, 31, 32], "extra_rope_config": 2, "extrem": [7, 14, 33], "f": [5, 6, 13, 16, 28, 34], "f1": 30, "f16c": 17, "f32": [17, 18], "f401": [6, 11, 12, 13, 16, 23, 29], "face": 3, "facebook": [3, 6, 28], "facilit": 34, "fact": [18, 33], "factor": [2, 6, 16, 31], "fail": [10, 26, 34], "failur": [12, 34], "fake": 2, "fake_quantize_per_tensor_affin": 8, "falcon": [2, 28, 34], "fall": [6, 12], "fals": [2, 4, 6, 7, 8, 13, 14, 15, 16, 17, 20, 22, 23, 26, 31, 32, 34], "famili": [2, 28, 33, 34], "fashionmnist": 16, "fast": [4, 12, 33, 34], "fast_bert": [2, 4, 6, 7, 11, 34], "fast_layer_norm": [2, 34], "faster": [2, 6, 7, 8, 30, 33], "fastest": 17, "fastlayernorm": [2, 34], "fatal_error": 6, "fault": 34, "favorit": 31, "fb": 34, "feasibl": 10, "featur": [0, 1, 2, 3, 5, 8, 10, 13, 14, 18, 20, 23, 25, 26, 28, 30, 31, 32, 33, 34], "feb": 3, "feed": [2, 9, 18], "feedback": 34, "feedforward": 28, "feel": [5, 18, 34], "few": [5, 7, 9, 13, 16, 18, 32, 34], "fewer": 21, "fft_fft": 8, "fft_fft2": 8, "fft_fftn": 8, "fft_hfft": 8, "fft_ifft": 8, "fft_ifft2": 8, 
"fft_ifftn": 8, "fft_ihfft": 8, "fft_irfft": 8, "fft_irfft2": 8, "fft_irfftn": 8, "fft_rfft": 8, "fft_rfft2": 8, "fft_rfftn": 8, "figur": [1, 2, 21, 28, 33], "file": [2, 4, 5, 6, 8, 14, 15, 16, 17, 18, 31, 34], "filenam": 5, "find": [1, 2, 7, 14, 16, 23, 26, 30, 31, 34], "find_packag": 6, "findavx": 17, "fine": [3, 20, 29, 31, 32, 33, 34], "finer": [1, 7, 20], "finish": [6, 11, 12, 13, 16, 20], "first": [2, 3, 5, 6, 7, 9, 10, 12, 16, 19, 20, 21, 26, 31, 32, 33], "firstli": [2, 28], "fit": [5, 7, 33, 34], "fix": [2, 5, 7, 34], "flag": [2, 5, 7, 17, 20, 31, 34], "flake8": 5, "flan": 28, "flash": 34, "flash_atten_varlen": 2, "flatten": [16, 20], "flexibl": 34, "float": [2, 6, 7, 8, 14, 15, 16, 17, 21, 29, 34], "float16": [2, 8], "float32": [2, 13, 21, 23, 26, 30, 31, 34], "float64": 8, "flourish": 28, "flow": 26, "flush": [6, 23], "fma": 17, "fn_type": 17, "focu": [2, 10, 18, 29, 34], "focus": [13, 34], "fold": [2, 10, 15, 16, 26, 34], "folder": 5, "follow": [1, 2, 4, 5, 6, 7, 8, 11, 14, 15, 16, 17, 18, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 34], "footbal": 7, "footprint": [7, 21, 28, 34], "forg": 33, "fork": [17, 33], "format": [2, 5, 6, 7, 9, 14, 22, 26, 28, 31, 33, 34], "format_tag": 18, "former": 6, "formerli": [30, 33, 34], "formula": 21, "forward": [2, 6, 8, 13, 16, 20, 21, 26, 32, 33, 34], "found": [1, 6, 7, 14, 16, 18, 29, 31, 32, 33, 34], "foundat": [18, 33], "fp16": [2, 6, 17, 29], "fp32": [2, 4, 16, 17, 19, 21, 23, 28, 34], "fp32_gw": 21, "fp32_w": 21, "fpn": 30, "fraction": 21, "fractional_max_pool2d": 8, "fractional_max_pool3d": 8, "fragment": 33, "framework": [5, 34], "free": [31, 34], "freez": [6, 8, 10, 13, 15, 16, 20, 23, 26, 32, 34], "freezed_model": [26, 34], "frequenc": [2, 30], "frequent": 7, "friendli": [7, 33], "from": [1, 2, 3, 4, 5, 8, 10, 11, 13, 15, 16, 17, 18, 19, 20, 21, 23, 25, 28, 29, 31, 32, 33, 34], "from_embeddingbag_list": 7, "from_pretrain": [4, 6, 11, 23, 29, 32], "front": [13, 34], "frontend": [1, 2, 7, 20, 28, 34], 
"frozenbatchnorm": 34, "frozenbatchnorm2d": 7, "fsi": 34, "fulfil": 20, "full": [2, 5, 18, 32, 33, 34], "fulli": [5, 15, 17, 21, 31, 33, 34], "function": [2, 5, 6, 7, 8, 10, 11, 12, 14, 15, 17, 20, 21, 23, 26, 28, 29, 31, 33, 34], "further": [1, 2, 5, 6, 7, 18, 20, 28, 33, 34], "fuse": [2, 7, 13, 16, 19, 28, 34], "fuse_update_step": 2, "fusion": [1, 2, 7, 10, 21, 28, 34], "futur": [7, 28, 34], "futuretensor": 20, "fx": [3, 7, 10, 26, 34], "g": [2, 7, 8, 16, 17, 18, 28, 34], "gain": [1, 7, 26, 28, 34], "game": 7, "gave": 14, "gb": 20, "gcc": 17, "gcp": 3, "gelu": [2, 13, 34], "gemm": [7, 18, 26, 28, 34], "gen": [3, 30, 34], "gen_": 2, "gen_id": [6, 23], "gen_text": [6, 23], "genai": [1, 7, 28], "gender": 7, "gener": [1, 5, 6, 7, 10, 12, 16, 17, 18, 21, 23, 28, 29, 30, 31, 32, 33, 34], "generate_kwarg": [6, 23], "genv": 31, "geomean": 34, "geqrf": 8, "get": [1, 2, 3, 4, 6, 7, 10, 11, 15, 17, 20, 21, 22, 26, 28, 29, 30, 31, 33, 34], "get_acceler": 29, "get_core_list_of_node_id": 2, "get_cpp_typesize_and_vecs": 17, "get_cpp_typesize_and_vecsize_kernel_fn": 17, "get_cpp_typesize_and_vecsize_kernel_impl": 17, "get_cpp_typesize_and_vecsize_kernel_stub": 17, "get_smooth_quant_qconfig_map": [2, 6, 29], "get_weight_only_quant_qconfig_map": [2, 6, 29], "getattr": [6, 23], "getveclength": 17, "getveclengthkrnl": 17, "gif": 31, "gil": 20, "git": [2, 5, 28], "github": [1, 2, 5, 6, 7, 8, 34], "give": [32, 34], "given": [2, 6, 13, 14, 16, 28], "global": [2, 20, 22, 34], "global_past_key_valu": 6, "gnu": [6, 17, 32], "go": [2, 5, 8], "gomp_cpu_affin": 33, "good": [1, 2, 5, 7, 12, 18, 19, 28, 33, 34], "googl": [3, 5, 28], "gperftool": 33, "gpertool": 33, "gpt": [2, 28, 30], "gpt2": 26, "gptbigcod": [2, 28], "gptj": 2, "gptjforcausallm": 2, "gptq": [2, 6, 34], "gpu": [1, 3, 18, 34], "grad": [7, 19], "grad0": 19, "grad1": 19, "grad_i": 19, "grad_n": 19, "gradient": 7, "grain": [1, 3, 7, 20], "granular": [2, 31, 32, 33], "graph": [1, 4, 8, 10, 16, 23, 26, 31, 34], "graph_for": 13, 
"graph_mod": [2, 4, 7, 12, 34], "graphic": 33, "great": 33, "greater": 2, "greedi": [6, 23], "grid": 14, "grid_sampl": 8, "grokk": 3, "ground": 21, "group": [2, 19, 20, 33], "group_norm": 8, "group_siz": 2, "gru": 15, "grucel": 15, "gt": [4, 14, 33], "gtest_filt": 5, "guid": [3, 6, 7, 17, 32, 34], "guidanc": 7, "guidelin": 18, "gw": 21, "h": [5, 6, 7, 16, 18, 26, 31, 32], "ha": [0, 1, 2, 7, 10, 14, 17, 18, 20, 21, 26, 28, 30, 31, 33, 34], "had": [6, 33], "half": [2, 7, 17, 21], "halv": 21, "handl": [6, 18, 33], "handler": 32, "hang": [33, 34], "happen": 7, "hard": [18, 26], "hardsigmoid": 34, "hardswish": [13, 34], "hardtanh": 13, "hardwar": [1, 3, 17, 25, 28, 32, 34], "hav": 17, "have": [1, 2, 5, 6, 7, 9, 14, 17, 18, 20, 21, 23, 26, 27, 28, 30, 31, 32, 33, 34], "head": [2, 34], "head_dim": 2, "head_map": 2, "head_mask": 2, "head_num": 2, "head_siz": 2, "header": 17, "heavi": 7, "heavier": 28, "height": 18, "hello": 5, "help": [2, 5, 6, 17, 23, 28, 31, 33, 34], "helper": 2, "here": [5, 8, 10, 13, 16, 17, 18, 20, 26, 32, 33, 34], "herebi": 16, "hero": 34, "heterogen": 34, "heurist": [2, 20, 34], "hf": [6, 28], "hf_beam_sampl": 34, "hf_beam_search": 34, "hf_greedy_search": 34, "hf_sampl": 34, "hidden": [2, 18, 28], "hidden_s": [2, 6], "hidden_st": 2, "high": [19, 21, 33], "higher": [2, 7, 13, 17, 18, 28], "higher_is_bett": 14, "highli": [7, 23, 28, 33, 34], "hinge_embedding_loss": 8, "hint": [2, 20], "histogram": [30, 34], "histogramobserv": [2, 15], "histori": [2, 14, 28], "hobbi": 7, "hold": [18, 33], "home": [31, 32], "homebrew": 5, "hood": 34, "hook": [10, 16], "hopefulli": 7, "host": [30, 34], "hostfil": 31, "hostnam": 31, "hotspot": 28, "how": [1, 2, 10, 15, 17, 18, 23, 28, 31, 32, 33, 34], "howev": [2, 5, 7, 8, 9, 16, 20, 26, 28, 31, 33, 34], "hp": 14, "hpc": 11, "html": [2, 5, 16], "http": [2, 5, 16, 34], "hub": 28, "huber_loss": 8, "hug": 3, "huge": [7, 14, 33], "hugginfac": 34, "huggingfac": [2, 6, 26, 28, 32, 34], "huggingface_transform": 32, "hurt": 20, 
"hw": 18, "hwc": 18, "hwio": 18, "hwn": 18, "hydra": 31, "hyper": [2, 30, 33, 34], "hyperparam": 14, "hyperparamet": [4, 7], "hyperparamt": 14, "hyperthread": 32, "hypertun": [4, 34], "hypertune_directori": 14, "hypervisor": 34, "hypothesi": 5, "i": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 21, 22, 23, 26, 27, 28, 29, 30, 32, 33, 34], "i_mpi_pin_domain": 31, "iakv": [2, 28], "ic": 2, "ic_block": 2, "id": [2, 31, 32], "idea": [11, 21, 33], "ideep": [17, 18], "ident": [2, 10, 18], "identif": [6, 17], "identifi": 34, "idx": [2, 28, 31], "ieityuan": 28, "illeg": 34, "illustr": [18, 19, 21, 31, 33], "imag": [8, 13, 18, 33, 34], "image_classifi": 32, "imagenet": [18, 30], "immedi": 7, "immintrin": 17, "impact": [2, 7, 20], "imper": [20, 34], "impl": 17, "implement": [1, 5, 7, 11, 19, 26, 28, 33, 34], "implicit": 18, "implicitli": 6, "import": [1, 2, 4, 5, 6, 7, 10, 11, 12, 13, 15, 16, 17, 18, 20, 21, 23, 25, 26, 28, 29, 32, 33, 34], "impract": [7, 14], "improv": [1, 3, 7, 8, 13, 20, 22, 28, 30, 32, 33], "in1": 7, "in2": 7, "in3": 7, "in_i": 7, "in_m": 7, "inaccur": 21, "inc": [16, 17, 22, 28], "includ": [1, 2, 5, 6, 7, 10, 14, 15, 17, 23, 26, 27, 28, 30, 34], "inclus": 33, "incorrect": [12, 26, 34], "increas": [1, 2, 3, 21, 26, 28, 30, 33, 34], "independ": 31, "index": [2, 5, 18, 28, 33], "index_copi": 8, "index_to_nam": 32, "indic": [2, 6, 18, 28], "indirect": 2, "indirect_access_kv_cache_attent": [2, 34], "indirectaccesskvcacheattent": [2, 34], "individu": [5, 30], "inductor": [7, 34], "inevit": 10, "inf": 14, "infer": [2, 3, 4, 7, 10, 11, 12, 15, 18, 20, 21, 23, 26, 30, 33, 34], "inferenc": 2, "inference2": 30, "inference3": 30, "inference_mod": [6, 23, 29], "influenc": [31, 33], "info": [2, 6, 17, 26, 31, 32, 34], "inform": [1, 2, 6, 7, 14, 17, 18, 28, 31, 32, 33, 34], "ingredi": 18, "init": [2, 5, 15, 34], "init_alpha": [16, 22], "init_distribut": 29, "init_infer": 29, "init_method": 6, "init_process_group": 6, "initi": [2, 20, 32], "inject": 34, 
"inlin": 17, "inplac": [2, 4, 6, 13, 15, 18, 23, 32], "input": [2, 6, 7, 9, 10, 13, 15, 16, 17, 18, 22, 23, 26, 29, 30, 32, 33, 34], "input1": 10, "input_channel": 2, "input_hint": 20, "input_id": [6, 23], "input_ids_pad": 6, "input_s": [6, 23], "input_split_hint": [2, 20], "input_tokens_length": [6, 23], "inputpath": 32, "insert": [2, 16], "insid": [2, 5, 20, 31], "inspir": 34, "instal": [4, 5, 6, 23, 25, 26, 28, 33, 34], "instanc": [2, 7, 10, 14, 32, 34], "instance_idx": 31, "instancenorm": 34, "instanti": 6, "instead": [7, 8, 14, 19, 20, 29, 30, 31, 32, 33, 34], "instruct": [1, 2, 5, 6, 7, 8, 17, 21, 23, 24, 25, 28, 30, 33, 34], "int": [2, 6, 7, 14, 17, 23, 26, 29, 31, 34], "int4": [2, 28, 29, 34], "int8": [1, 2, 3, 4, 17, 18, 20, 22, 28, 29, 34], "int8_qconfig": 6, "integ": [28, 31, 33], "integr": [7, 18, 28, 33, 34], "intel": [2, 3, 4, 7, 8, 9, 10, 11, 13, 14, 16, 17, 20, 21, 22, 23, 25, 26, 27, 28, 29, 34], "intel discrete gpu": 1, "intel optim": 1, "intel_extension_for_pytorch": [1, 2, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17, 20, 23, 25, 29, 32, 34], "intel_pytorch_extens": [7, 25, 26, 34], "intel\u00ae extension for pytorch*": 1, "intend": 5, "intent": 5, "interact": [7, 34], "interconnect": 33, "interest": 5, "interfac": [5, 6, 18, 26, 28], "intern": [17, 18, 20, 32], "interpret": 31, "interrupt": 32, "intervent": 8, "intra": 2, "intrins": 17, "introduc": [1, 3, 7, 15, 18, 21, 22, 31, 33, 34], "introduct": [0, 2, 7, 28, 33, 34], "invalid": 33, "invers": 8, "investig": [2, 31], "invoc": [1, 7], "invok": [2, 6, 8, 10, 13, 20, 23, 26, 29, 34], "involv": 21, "io": 28, "iostream": 6, "ip": 31, "ipex": [1, 2, 3, 4, 6, 7, 9, 11, 12, 13, 15, 16, 17, 19, 20, 23, 26, 29, 31, 32, 34], "ipex_declare_dispatch": 17, "ipex_define_dispatch": 17, "ipex_en": 32, "ipex_fus": 2, "ipex_register_dispatch": 17, "ipexconfig": 6, "ipexrun": [4, 10, 31, 34], "is_caus": 2, "is_contigu": 18, "is_cus": 2, "is_dynam": [6, 15], "is_hyperthreading_en": 14, "is_runtime_ext_en": 2, 
"isa": [1, 34], "isa_codegen": 17, "isa_nam": 17, "isacodegen": 17, "issu": [1, 2, 5, 8, 21, 26, 33], "ital": 32, "item": 16, "iter": [2, 16, 21, 28, 34], "its": [2, 6, 7, 8, 14, 17, 21, 28, 30, 31, 32, 33, 34], "itself": [2, 5, 18], "ivalu": 6, "j": [2, 5, 17, 28, 30], "jan": 3, "je": 14, "jemalloc": [30, 32, 34], "jemallocl": 31, "jit": [1, 2, 5, 6, 7, 8, 13, 15, 16, 18, 20, 23, 26, 32, 34], "job": 5, "join": 33, "joint": 34, "joint_net": [26, 34], "json": [2, 6, 15, 16, 32, 34], "jul": 3, "jun": 3, "jupyt": 5, "just": [2, 14, 29, 34], "k": [2, 5], "kcpu": 17, "keep": [5, 12, 18, 21, 28, 32, 33, 34], "kei": [2, 7, 28, 34], "kept": 21, "kernel": [1, 2, 7, 20, 26, 28, 30, 33, 34], "kernel_s": 10, "key_cach": 2, "key_token": 2, "keystrok": 5, "keytensor": 2, "keyword": 2, "kill": 32, "kind": 7, "kineto_librari": 6, "kl_div": 8, "kmp": [31, 33], "kmp_": 20, "kmp_affin": [31, 32, 33], "kmp_blocktim": [31, 32, 33], "knob": [2, 4, 12, 31], "know": 5, "knowledg": 33, "known": [6, 10, 28], "kt": 3, "kv": 2, "kv_cach": [2, 28], "kwarg": [2, 29], "l1318": 2, "l1_loss": 8, "l2": 33, "l23": 2, "l4": 2, "l50": 2, "l76": 2, "label": 8, "lake": [7, 30, 34], "lamb": [19, 21], "land": [7, 34], "landscap": [1, 7, 28], "languag": [1, 2, 23, 24, 25, 26, 29, 34], "lar": 34, "larg": [1, 2, 19, 23, 24, 25, 26, 29, 30, 33, 34], "larger": [2, 20, 30, 31, 33, 34], "last": [3, 10, 21, 26, 34], "last_ind": 6, "latenc": [3, 14, 18, 28, 30, 32, 34], "later": [2, 7, 25, 33], "latest": [1, 2, 25, 28, 30, 34], "launch": [4, 6, 20, 32, 34], "launcher": [7, 13, 31, 33, 34], "law": 7, "layer": [2, 16, 20, 22, 28, 34], "layer_past": 2, "layernorm": [2, 13, 16, 22, 34], "layernorm_modul": 2, "layout": [2, 26, 34], "lazi": 5, "ld": 31, "ld_preload": [20, 31, 32, 33], "ldd": 6, "lead": 28, "leaki": 13, "leaky_relu": 13, "leakyrelu": 34, "learn": [3, 7, 8, 11, 13, 14, 21, 31, 33], "learning_r": [10, 34], "leav": [2, 20, 33], "left": [21, 28, 32], "legal": 34, "legend": 28, "len": [2, 6, 7, 13, 16, 17], 
"length": [2, 5, 14, 21, 26, 30, 34], "less": [2, 8, 18, 20, 26, 34], "let": [5, 10, 18, 19, 20, 21], "level": [7, 10, 13, 16, 18, 20, 21, 26, 33, 34], "leverag": [1, 7, 11, 28, 32, 34], "lib": [6, 31, 32], "lib64": [31, 32], "libc10": 6, "libdnnl_graph": 6, "libgomp": 33, "libintel": [6, 34], "libiomp": 33, "libiomp5": [20, 31, 32, 33], "libjemalloc": 31, "libpytorch_path": 6, "librari": [1, 2, 5, 6, 7, 17, 20, 32, 33, 34], "libtcmalloc": [31, 32], "libtorch": [6, 34], "libtorch_cpu": 6, "libxsmm": 2, "licens": 17, "lighter": 8, "lightweight": 34, "like": [1, 2, 3, 5, 6, 7, 8, 14, 18, 19, 21, 26, 28, 31, 33, 34], "limit": [5, 8, 10, 20, 26, 32, 33, 34], "linalg_choleski": 8, "linalg_cholesky_ex": 8, "linalg_cond": 8, "linalg_eig": 8, "linalg_eigh": 8, "linalg_eigv": 8, "linalg_eigvalsh": 8, "linalg_householder_product": 8, "linalg_inv": 8, "linalg_inv_ex": 8, "linalg_lstsq": 8, "linalg_matrix_rank": 8, "linalg_qr": 8, "linalg_solv": 8, "linalg_svd": 8, "linalg_svdv": 8, "linalg_tensorinv": 8, "linalg_tensorsolv": 8, "line": [5, 10, 13, 18, 31, 32, 33], "linear": [2, 6, 7, 8, 13, 15, 16, 18, 26, 33, 34], "linear2silumul": [2, 34], "linear_": 2, "linear_bn": 2, "linear_bn_fold": 2, "linear_m": 2, "linear_m_modul": 2, "linear_modul": 2, "linear_relu_stack": 16, "linear_s_modul": 2, "linearadd": [2, 34], "linearaddadd": [2, 34], "lineargelu": [2, 34], "linearize_indices_and_offset": 7, "linearmul": [2, 34], "linearnewgelu": [2, 34], "linearrelu": [2, 34], "linearsilu": [2, 34], "linearsilumul": [2, 34], "link": [1, 6, 17, 34], "linux": [5, 6, 17, 30, 31, 33], "list": [2, 5, 7, 8, 13, 14, 16, 18, 25, 29, 31, 32, 33, 34], "liuhaotian": 28, "live": 5, "ll": [5, 32, 33], "llama": [2, 3, 6, 28, 34], "llama2": [30, 34], "llama3": 34, "llava": [2, 28], "llm": [1, 16, 22, 24, 25, 34], "load": [1, 2, 6, 7, 13, 15, 16, 17, 23, 29, 32, 34], "load_dataset": 6, "load_qconf_summari": 15, "load_state_dict": [2, 34], "loader": 16, "local": [6, 20, 28, 31, 32, 33], "locat": [5, 17, 
34], "log": [4, 6, 13, 31, 32, 34], "logic": [2, 14, 18, 32, 33], "login": 6, "logit": 16, "long": [2, 6, 18, 21, 26, 28, 34], "long_factor": 2, "longer": [26, 30, 34], "longform": 26, "look": [5, 14, 16, 18], "loop": [5, 21, 29], "lose": 21, "loss": [2, 5, 6, 8, 16, 18, 21, 26], "loss_fn": 16, "lot": [28, 34], "low": [3, 4, 6, 7, 21, 23, 31, 33, 34], "low_cpu_mem_usag": [6, 23], "low_precision_checkpoint": [2, 6, 29], "lower": [2, 8, 17, 21, 28, 34], "lowest": 2, "lowp": [2, 6], "lowp_mod": [2, 6, 29], "lr": [6, 7, 8, 16, 19], "lr_decai": 19, "lsb": 17, "lscpu": 33, "lstm": [2, 10, 15, 34], "lstmcell": 15, "lstsq": 8, "lt": [4, 30], "lu_solv": 8, "m": [4, 14, 20, 26, 31, 32, 33, 34], "m6i": [30, 32], "m7i": 30, "machin": [3, 5, 6, 7, 14, 17, 26, 31, 32, 33, 34], "maco": 5, "macro": 17, "made": [5, 34], "mai": [1, 2, 3, 5, 6, 7, 8, 9, 16, 17, 18, 20, 26, 31, 32, 33, 34], "main": [1, 2, 5, 6, 14, 20, 31, 32], "mainli": [31, 34], "maintain": 8, "major": 16, "make": [2, 5, 6, 7, 14, 15, 17, 21, 23, 28, 32, 33], "make_tupl": 17, "makefil": 5, "malloc": [14, 31, 33], "malloc_conf": [31, 33], "mamx": 17, "man": [7, 33], "manag": [2, 8, 13, 20, 28, 31], "mandatori": 14, "mani": [5, 14, 28, 31, 33, 34], "manipul": 18, "mantissa": 21, "manual": [2, 7, 10, 14, 18, 20, 34], "manual_se": [6, 11], "map": [2, 6, 18, 30], "mar": [3, 32], "margin_ranking_loss": 8, "mask": [2, 7, 17, 26], "mask_valu": 17, "maskrcnn": [33, 34], "maskrnn": 34, "master": [2, 7, 21, 31], "master_addr": 6, "master_port": 6, "match": [2, 8, 17, 31], "math": 7, "matmul": [2, 8, 13, 26, 34], "matrix": [1, 6, 7, 25, 28], "matrix_rank": 8, "matur": 34, "mavx2": 17, "mavx512bf16": 17, "mavx512bw": 17, "mavx512dq": 17, "mavx512f": 17, "mavx512fp16": 17, "mavx512vl": 17, "mavx512vnni": 17, "max": [2, 6, 16, 17, 22, 23, 26, 34], "max_context_len": 2, "max_new_token": [6, 23], "max_num_blocks_per_seq": 2, "max_position_embed": 2, "max_seq": 2, "max_seq_len": 30, "max_seq_length": [10, 34], "max_seqlen_k": 2, 
"max_seqlen_kv": 2, "max_seqlen_q": 2, "max_trial": 14, "max_unpool2d": 8, "max_unpool3d": 8, "maxim": 14, "maximum": [2, 16, 17], "maxpool": 34, "maxpool2d": 13, "maycontainalia": 5, "md": 18, "me": 18, "mean": [2, 16, 17, 18, 20, 22, 28, 34], "meant": 34, "meanwhil": [12, 33, 34], "measur": [30, 34], "mechan": [1, 7, 17, 21, 34], "medium": [28, 34], "meet": [21, 33, 34], "meltdown": 30, "membind": 33, "memori": [2, 6, 7, 8, 9, 10, 13, 19, 20, 21, 26, 28, 30, 32, 34], "memory_format": [6, 7, 18, 23], "mention": [3, 10, 20, 21, 34], "merg": [0, 7, 34], "merged_emb": 7, "merged_input": 7, "mergedembeddingbag": 7, "mergedembeddingbagwith": 7, "mergedembeddingbagwithsgd": 7, "merit": 18, "mermori": 2, "messag": [2, 6, 10, 12, 18, 31], "meta": [6, 18, 28, 29, 34], "metadata_thp": [31, 33], "method": [2, 8, 15, 16, 18, 22, 26, 33, 34], "method1": 10, "method2": 10, "methodologi": [2, 6, 7, 19, 33], "methond": 15, "metric": [2, 16, 30], "mfma": 17, "mha": [2, 34], "mhz": 33, "microarchitectur": [33, 34], "microsoft": [2, 28], "might": [2, 7, 18, 26, 33, 34], "migrat": 7, "millisecond": 33, "min": [2, 16, 22, 26, 34], "mind": [18, 32], "mini": [2, 20, 28, 34], "minim": [7, 14, 17, 33], "minimum": [14, 16, 18], "minmax": 34, "minmaxobserv": [2, 6, 15], "misc": 34, "mish": 13, "miss": 5, "mistral": [2, 28, 34], "mistralai": 28, "mitig": [20, 30], "mix": [2, 6, 13, 23, 26, 28, 34], "mixed_dtyp": 34, "mixtral": [2, 28], "mixtur": [8, 34], "mkdir": 6, "mkl": 34, "mkldnn": 18, "mkldnn_util": 18, "mllama": 2, "mlp": 34, "mm": 8, "mmuzzy_decay_m": 33, "mmx": 17, "mno": 17, "mobilenet": 30, "mode": [1, 2, 5, 7, 10, 12, 18, 20, 23, 26, 32, 34], "model": [1, 2, 3, 4, 8, 9, 10, 11, 12, 14, 16, 23, 24, 25, 26, 29, 30, 33, 34], "model1": 20, "model2": 20, "model_execut": 34, "model_id": [6, 23], "model_log": 32, "model_name_or_path": [10, 29, 34], "model_script": 20, "model_service_work": 32, "model_state_dict": 6, "model_stor": 32, "model_to_be_calibr": 34, "modelfamili": 28, 
"modeling_llama": 2, "modelurl": 32, "modern": 3, "modifi": [2, 5, 6], "modul": [1, 6, 7, 8, 13, 16, 17, 26, 29, 31, 34], "modular": 2, "modulist": 7, "momentum": [6, 10, 21], "monkei": 10, "more": [1, 2, 5, 6, 7, 8, 10, 11, 13, 16, 17, 19, 20, 21, 23, 26, 28, 32, 33, 34], "moreov": [1, 2, 28], "mosaicml": 28, "most": [2, 6, 7, 13, 21, 28, 30, 32, 33, 34], "motherboard": 33, "motiv": [2, 20], "move": [18, 33], "movingaverageminmax": 34, "mp_size": 29, "mpi": 31, "mpiexec": 31, "mpt": [2, 28, 34], "mrdimm": 34, "mrpc": 30, "mse_loss": 8, "much": [15, 18, 21, 28, 31, 33], "mul": [2, 13, 16], "multi": [2, 7, 14, 20, 28, 31, 33, 34], "multi_margin_loss": 8, "multi_stream": 2, "multi_stream_input_hint": 34, "multi_stream_model": [20, 34], "multi_stream_output_hint": 34, "multidimension": 18, "multiheadattent": 28, "multilabel_margin_loss": 8, "multilabel_margin_loss_forward": 8, "multipl": [2, 5, 7, 8, 16, 17, 18, 26, 28, 30, 32, 33, 34], "multiplex": 34, "multipli": 2, "multistreammodul": [2, 7, 20, 26, 34], "multistreammodulehint": [2, 20, 34], "multithread": 33, "must": [2, 5, 14, 17, 19], "mutual": 31, "muzzy_decay_m": [31, 33], "my": 18, "mykernel": 17, "mymodel": 34, "mypi": 5, "n": [2, 6, 7, 16, 18, 19, 20, 26, 32, 33, 34], "n1": 18, "n2": 18, "n_iter": 32, "name": [2, 5, 7, 14, 17, 25, 28, 31, 32, 33, 34], "namespac": [8, 17], "nan": [17, 34], "nanquantil": 8, "narg": 6, "narrow": 5, "nativ": [1, 6, 7, 8, 17, 19, 21, 26, 28, 34], "natur": [18, 21, 28], "naver": 3, "nb": 18, "nc": 32, "nchw": [7, 33], "ncore": [10, 31], "ncore_per_inst": [14, 34], "ncores_per_inst": 14, "nd": 18, "necessari": 18, "necessarili": 2, "neck": 19, "need": [2, 5, 6, 7, 10, 13, 14, 16, 17, 18, 19, 20, 21, 23, 26, 29, 31, 32, 33, 34], "need_linearize_indices_and_offset": 7, "neelnanda": 6, "neg": 21, "neglig": 18, "neighbor": 2, "neox": [2, 28], "net": 34, "network": [1, 3, 7, 8, 20, 25, 28, 33], "neural": [1, 3, 7, 16, 22, 25, 28, 33, 34], "neuralnetwork": 16, "new": [3, 5, 12, 16, 17, 
18, 20, 23, 26, 29, 33], "new_gelu": 2, "new_layer_past": 2, "newer": [1, 28, 33], "newgeluactiv": 2, "newkernel": 17, "newkernelkrnl": 17, "newli": 34, "newlin": 5, "next": [5, 7, 34], "nf4": [2, 29], "nhwc": [7, 33, 34], "nifti": 33, "ninstanc": [10, 14, 31, 34], "nint": 5, "nll_loss": 8, "nll_loss2d": 8, "nlp": [6, 7, 26, 30, 34], "nm": [7, 34], "nn": [2, 6, 7, 8, 10, 13, 15, 16, 18, 20, 26, 34], "nnc": 26, "nnode": 31, "no_grad": [4, 6, 10, 11, 12, 13, 15, 16, 20, 23, 26, 29, 32, 34], "node": [2, 20, 30, 32, 33, 34], "node0": 33, "node1": 33, "node_id": [2, 20, 31, 32, 34], "non": [2, 5, 8, 13, 18, 30, 32, 34], "noncontigu": 18, "none": [2, 6, 29, 31], "noqa": [6, 11, 12, 13, 16, 23, 29], "normal": [1, 2, 6, 7, 13, 20, 28, 33, 34], "normalized_shap": 2, "note": [2, 3, 5, 6, 15, 16, 17, 18, 20, 22, 24, 28, 30, 31, 32, 33], "notfound": 6, "noth": 2, "notic": [27, 31, 32], "nov": 3, "now": [2, 7, 15, 18, 32, 33, 34], "np": [16, 31], "nproc": 31, "nth": [32, 33], "num": [2, 20, 32, 33, 34], "num_attention_head": 6, "num_beam": [6, 23], "num_block": 2, "num_featur": 7, "num_head": 2, "num_hidden_lay": 6, "num_kv_head": 2, "num_nod": 14, "num_seq": 2, "num_stream": [2, 20, 34], "num_token": 2, "num_train_epoch": [10, 34], "numa": [2, 20, 31, 32, 34], "numactl": [20, 31, 32], "number": [1, 2, 5, 6, 7, 14, 16, 19, 20, 21, 26, 32, 34], "numer": [2, 8, 33], "numpi": 16, "o": [6, 17, 23, 30, 34], "o0": [2, 26, 34], "o1": [2, 26, 34], "o3": 17, "object": [2, 6, 7, 14, 17, 20, 33, 34], "observ": [2, 9, 13, 15, 34], "obsev": 15, "obtain": 16, "obviou": 28, "occupi": 26, "occur": 34, "occurr": 28, "off": [7, 8, 21, 28, 30, 34], "offer": [1, 5, 33], "offici": [5, 32, 33, 34], "offlin": 34, "offset": [2, 18, 28], "often": 7, "old": 34, "omp": [20, 26, 31, 32, 33, 34], "omp_num_threa": 26, "omp_num_thread": [20, 26, 31, 32, 34], "omp_proc_bind": [31, 33], "omp_schedul": [31, 33], "omp_set_num_thread": 34, "onboard": [19, 33], "onc": [2, 5, 6, 14, 17, 18, 20, 21, 32, 33], 
"ondevic": 29, "one": [2, 5, 7, 12, 13, 14, 16, 18, 19, 20, 26, 29, 31, 33, 34], "oneapi": [6, 33], "oneccl": [3, 6, 31, 34], "oneccl_bindings_for_pytorch": 6, "onednn": [2, 3, 13, 17, 26, 28, 34], "onednn_primitive_cache_capac": 33, "onednn_verbos": 4, "ones": [2, 6, 17], "onli": [1, 2, 5, 7, 8, 10, 11, 13, 14, 15, 16, 17, 18, 20, 21, 26, 28, 31, 32, 34], "onlyquantizationint4": 28, "onlyquantizationint8": 28, "oob": [10, 34], "op": [2, 7, 15, 16, 22, 28, 34], "op_type_dict": 2, "open": [1, 16, 28, 33], "openai": 28, "openmp": [2, 7, 20, 26, 30, 32, 34], "oper": [1, 2, 6, 8, 13, 15, 21, 32, 33, 34], "opportunit": 2, "opt": [2, 6, 17, 28], "optdecoderlay": 16, "optim": [1, 3, 4, 6, 8, 9, 11, 12, 14, 16, 18, 20, 21, 23, 25, 26, 31, 32, 33, 34], "optimize_lstm": 2, "optimize_transform": 34, "optimized_model": [2, 34], "optimized_optim": 2, "optimizer_state_dict": 6, "optimum": 10, "optin": 2, "option": [1, 2, 5, 7, 10, 14, 15, 16, 29, 31, 34], "optyp": 2, "order": [2, 17, 18, 21, 31, 33, 34], "org": [2, 7, 16, 26, 34], "organ": 18, "orgqr": 8, "origin": [2, 6, 7, 12, 13, 15, 17, 20, 29, 34], "original_max_position_embed": 2, "original_model": 2, "ormqr": 8, "other": [2, 6, 7, 8, 14, 17, 18, 19, 23, 28, 31, 33], "other_1": 2, "other_2": 2, "other_arg": 19, "otheriws": 13, "otherwis": [2, 7, 20], "our": [5, 16, 19, 28, 33, 34], "out": [2, 5, 6, 7, 8, 10, 13, 16, 19, 20, 30, 31, 33, 34], "outlier": [7, 16], "outplac": [18, 34], "output": [2, 6, 7, 8, 13, 14, 16, 18, 23, 26, 34], "output_concat_hint": [2, 20], "output_dir": [10, 14, 34], "output_hint": 20, "output_tokens_length": [6, 23], "outsid": 20, "outstand": 5, "over": [5, 7, 8, 9, 16, 18, 30, 31, 34], "overal": 33, "overflow": [26, 34], "overhead": [1, 2, 7, 10, 19, 20, 26, 28, 33, 34], "overlap": 32, "overrid": 15, "overridden": [2, 17], "oversize_threshold": [31, 33], "overview": [7, 25, 34], "overwrit": [2, 31], "own": [2, 6, 15, 28], "owner": 13, "p": 34, "p29": 30, "p90": 30, "pack": [2, 20, 34], "packag": 
[1, 2, 5, 6, 7, 10, 23, 25, 26, 32, 33, 34], "pad": [8, 10, 20, 34], "pad_max": 6, "pad_val": 6, "padding_mod": 34, "page": [2, 6, 13, 20, 24, 29, 30, 33, 34], "pagedattent": [2, 34], "paper": [2, 34], "parallel": [2, 5, 6, 7, 28, 33, 34], "param": [2, 19, 31], "param_i": 19, "param_n": 19, "paramet": [2, 6, 7, 8, 10, 16, 17, 19, 20, 21, 26, 28, 29, 30, 31, 33, 34], "parse_arg": [6, 23], "parser": [6, 23], "part": [3, 5, 7, 8, 18, 21, 26, 31, 33, 34], "parti": 34, "partial": 7, "particular": [5, 6, 8, 29, 34], "partit": [13, 33], "pass": [1, 2, 5, 10, 17, 20, 26, 32, 34], "past": 28, "past_key_valu": [2, 6], "past_kv_length": 2, "patch": [10, 34], "path": [2, 6, 7, 14, 18, 20, 23, 31, 33, 34], "pattern": [7, 11, 18, 28, 34], "pdf": 2, "pdropout": 2, "peak": [2, 7, 11, 34], "penal": 33, "pend": 34, "per": [2, 10, 15, 16, 20, 30, 31, 32, 33, 34], "per_batch": 2, "per_batch_ic_block": 2, "per_batch_ic_block_sym": 2, "per_channel_symmetr": [2, 6, 15], "per_device_train_batch_s": [10, 34], "per_ic_block": 2, "per_tensor": 2, "per_tensor_affin": [6, 15, 34], "per_tensor_symmetr": 15, "perchannelminmaxobserv": [2, 6, 15], "perf": [11, 18], "perform": [1, 2, 3, 4, 6, 7, 8, 9, 10, 13, 14, 15, 16, 18, 19, 21, 25, 28, 29, 31], "period": 33, "person": 3, "perspect": [2, 13, 18, 21, 28, 31, 33], "pertain": 17, "phase": [2, 20], "phi": [2, 28, 34], "physic": [2, 14, 20, 32, 33], "pick": 5, "piec": [2, 20], "pile": 6, "pin": [2, 20], "pinvers": 8, "pip": [4, 5, 33, 34], "pip3": 34, "place": [2, 8, 28, 33, 34], "placeholderobserv": [6, 15], "placement": 33, "plai": [7, 33], "plan": [5, 7, 10], "platform": [3, 7, 18, 32, 33, 34], "platinum": [14, 30, 32, 33], "pleas": [2, 6, 7, 11, 16, 22, 26, 28, 29, 31, 33, 34], "plu": 33, "pmi_rank": 6, "pmi_siz": [6, 29], "point": [2, 6, 8, 15, 21, 33, 34], "pointer": 17, "poisson_nll_loss": 8, "polar": 8, "polici": 33, "polish": 34, "polymorph": 17, "pool": [2, 20, 34], "poor": [26, 34], "popular": [1, 7, 22, 28, 30, 34], "popup": 5, "port": 
31, "portabl": 11, "portion": 16, "pos_embd_dim": 2, "posit": [2, 28, 33, 34], "position_id": [2, 6], "position_ids_pad": 6, "possibl": [2, 14, 15, 19, 28, 31, 33, 34], "post": [2, 4, 5, 7, 15, 28, 34], "potenti": [3, 7, 34], "pow": 13, "power": [2, 7, 33, 34], "ppn": 31, "pr": [7, 18, 34], "practic": [6, 21, 24, 28, 33], "pragma": 17, "pre": [2, 28, 34], "precis": [2, 4, 6, 13, 21, 23, 26, 30, 34], "pred": 16, "predefin": 2, "predict": 16, "prefer": [1, 7, 8, 15, 24], "prefetchw": 17, "prefetchwt1": 17, "prefil": [2, 34], "prefix": [31, 34], "preload": [2, 31], "prepack": [2, 6, 10, 18, 26, 34], "prepar": [2, 4, 6, 13, 16, 26, 29, 32, 34], "prepared_model": [2, 4, 6, 13, 15, 16, 26, 29, 34], "prerequisit": [5, 6], "present": 32, "pretrain": [6, 32, 34], "pretti": 33, "prevent": 19, "previou": [14, 16, 18, 33, 34], "previous": 32, "primari": 33, "primarili": [8, 34], "primit": [11, 20, 30, 34], "principl": [3, 18], "print": [6, 11, 12, 13, 14, 16, 17, 23, 31], "printf": 5, "prior": [2, 23], "privat": 34, "probabl": 2, "problem": [7, 19, 26, 32, 33], "proc": 31, "procedur": 32, "process": [2, 6, 7, 11, 12, 14, 16, 19, 20, 21, 26, 31, 32, 33], "processor": [3, 7, 19, 21, 28, 30, 33, 34], "proclist": 33, "prod": 8, "produc": [5, 8], "product": [1, 2, 7, 14, 28, 34], "program": [1, 5, 7, 11, 20, 31, 33, 34], "progress": [26, 28, 34], "project": [1, 6], "prompt": [4, 6, 23, 34], "propag": [13, 21, 33], "proper": 34, "properli": 31, "properti": [6, 32], "propos": [5, 7, 11, 16, 18, 21], "prototyp": [4, 13, 20, 26, 34], "provid": [1, 2, 5, 6, 7, 8, 11, 12, 13, 14, 16, 20, 22, 24, 26, 28, 29, 31, 32, 33, 34], "pseudo": [19, 21, 34], "pseudocod": [26, 34], "pt": [6, 13, 14, 15, 23, 32, 34], "pth": 6, "pthread": 20, "ptmalloc": 32, "ptq": 7, "public": 34, "publish": 34, "pull": 5, "purlei": 33, "purpos": [17, 31, 32, 33], "push": 34, "push_back": 6, "put": 33, "py": [2, 5, 10, 14, 20, 31, 32, 34], "pyg": 3, "pyi": 5, "pypi": [26, 34], "python": [1, 2, 4, 10, 14, 17, 20, 26, 
28, 29, 31, 32, 33, 34], "pytorch": [2, 3, 4, 6, 7, 8, 9, 10, 13, 14, 16, 17, 20, 23, 25, 26, 27, 28, 29, 30, 31, 33, 34], "q": [2, 28], "qa": [10, 34], "qconf_summari": [6, 15, 16, 29], "qconfig": [2, 4, 6, 13, 16, 26, 29, 32, 34], "qconfig_map": 6, "qconfig_summary_fil": [2, 6, 29], "qconfig_summary_file_path": 29, "qconfigmap": 6, "qint8": [2, 6, 15], "qkv": 34, "qparam": 15, "qr": 8, "qscheme": [2, 6, 15, 34], "qualiti": 34, "quant": [2, 16], "quant_method": 2, "quant_stat": 15, "quantconf": 34, "quantil": 8, "quantiz": [1, 3, 4, 13, 22, 26, 28, 30, 32, 34], "quantizat": 2, "quantization_config": [2, 6, 29], "quantize_per_tensor": 26, "quantized_model": [13, 15, 34], "queri": [2, 17, 18], "query_roteri": 2, "query_token": 2, "question": [18, 30], "quick": [1, 20, 24, 25], "quick_check": 5, "quickli": 2, "quicklint": 5, "quickstart_tutori": 16, "quint8": [6, 15], "quit": [17, 34], "qwen": [2, 28, 34], "qwen2": [28, 34], "r": [5, 6, 7, 14, 23, 30, 32, 33], "rais": [2, 10], "rand": [6, 8, 12, 13, 20, 26, 34], "randint": [6, 11, 32], "randn": [2, 10, 13, 16, 18, 32, 34], "random": 14, "rang": [1, 6, 7, 15, 16, 19, 21, 26, 31, 32, 34], "rank": [6, 31, 34], "rapid": 3, "rate": 21, "rather": [2, 18], "ratio": [22, 30, 34], "raw": 2, "rc": 34, "rc3": 34, "re": [5, 8, 32, 33, 34], "reach": 34, "read": [7, 19], "readm": 34, "real": [2, 7, 14, 15, 30, 34], "realli": 5, "realtim": 30, "reason": [2, 10, 18, 20, 34], "rebas": [5, 34], "receip": [16, 20], "receipt": 20, "receiv": 21, "recent": [6, 7, 18], "recip": [2, 4, 7, 13, 15, 26, 28, 34], "recognit": 33, "recommend": [1, 5, 6, 7, 9, 10, 15, 16, 20, 23, 30, 31, 33, 34], "record": [14, 32], "recov": 21, "recurs": 5, "reduc": [1, 2, 7, 15, 19, 20, 21, 22, 26, 28, 33, 34], "reduce_rang": 15, "reduct": 34, "refer": [1, 7, 9, 13, 14, 16, 17, 18, 20, 22, 23, 24, 25, 32, 34], "refin": 34, "reflection_pad1d": 8, "reflection_pad2d": 8, "regard": 13, "regardless": [8, 34], "region": [2, 8, 17, 33], "regist": [1, 7, 10, 16, 17, 
34], "registr": 7, "regress": [9, 34], "regular": [6, 21], "reinstal": [5, 26], "reinterpret": 18, "reinterpret_cast": 17, "rel": [2, 4, 16, 31, 34], "relat": [2, 6, 13, 17, 31, 33, 34], "releas": [1, 17, 18, 26, 30, 33], "reli": [18, 20], "relu": [2, 7, 13, 16, 18, 26, 34], "relu6": 34, "remain": 32, "remaind": [2, 20], "remark": [26, 30, 33], "remot": 33, "remov": [2, 5, 21, 34], "reorder": [2, 18, 28], "reorder_cach": 28, "repeat": [10, 18, 21], "repeatedli": 5, "replac": [2, 5, 7, 10, 26, 34], "replace_dropout_with_ident": 2, "replication_pad1d": 8, "replication_pad2d": 8, "replication_pad3d": 8, "repo": [5, 6, 7], "repo_root": 29, "report": [1, 17], "repres": [5, 7, 21], "represent": 18, "reproduc": 32, "request": [1, 5, 20, 32], "requir": [2, 5, 6, 8, 10, 16, 18, 21, 26, 28, 29, 31, 32, 34], "research": 28, "reserv": 33, "reshape_and_cach": 2, "residu": 31, "resiz": [6, 13], "resnet18": 34, "resnet18_xpu": 34, "resnet34": [30, 34], "resnet3d": 34, "resnet50": [12, 13, 14, 18, 30, 31, 33, 34], "resnet50_weight": [6, 12, 13], "resnext": 30, "resnext101": [18, 34], "resnext3d": 34, "resolv": 34, "resourc": [13, 20, 28, 32, 33], "respect": [14, 16, 30, 31, 34], "respons": 30, "rest": 32, "restart": 32, "result": [1, 2, 6, 10, 12, 14, 16, 18, 20, 21, 30, 31, 32, 33], "retinanet": 34, "retriev": 33, "return": [2, 6, 7, 8, 10, 16, 17, 20, 26, 34], "return_dict_in_gener": 34, "return_softmax": 2, "return_tensor": [6, 23], "reus": [2, 33], "review": [7, 34], "rf": 5, "rfc": 18, "rh": 17, "right": [7, 21, 23, 28], "risk": 34, "rm": 5, "rms_norm": [2, 34], "rmsnorm": [2, 28, 34], "rmsnorm_modul": 2, "rn50": [13, 34], "rn50_int8_jit": 32, "rn50_ipex_int8": 32, "rnn": 34, "rnncell": 15, "rnnt": [26, 34], "ro": 2, "roberta": [26, 34], "roialign": [7, 34], "role": 33, "root": [6, 13, 16, 17, 28], "rope": [28, 34], "rope_modul": 2, "rotari": [2, 28], "rotary_dim": 2, "rotary_embed": [2, 34], "rotary_half": 2, "rotary_ndim": 2, "rotaryembed": [2, 34], "roughli": 18, "round": 
[13, 21], "rounding_bia": 17, "row": 7, "rst": 5, "rule": [21, 34], "run": [2, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 26, 30, 31, 32, 33, 34], "run_20210712212258_inst": 31, "run_20210712212258_instance_0_cores_0": 31, "run_20210712214504_inst": 31, "run_20210712214504_instance_0_cores_22": 31, "run_20210712220928_inst": 31, "run_20210712220928_instance_0_cores_0": 31, "run_20210712221150_inst": 31, "run_20210712221150_instance_0_cores_0": 31, "run_20210712221150_instance_1_cores_22": 31, "run_20210712221305_inst": 31, "run_20210712221305_instance_0_cores_0": 31, "run_20210712221305_instance_1_cores_11": 31, "run_20210712221305_instance_2_cores_22": 31, "run_20210712221305_instance_3_cores_33": 31, "run_20210712221415_inst": 31, "run_20210712221415_instance_0_cores_0": 31, "run_20210712221415_instance_10_cores_40": 31, "run_20210712221415_instance_1_cores_4": 31, "run_20210712221415_instance_2_cores_8": 31, "run_20210712221415_instance_3_cores_12": 31, "run_20210712221415_instance_4_cores_16": 31, "run_20210712221415_instance_5_cores_20": 31, "run_20210712221415_instance_6_cores_24": 31, "run_20210712221415_instance_7_cores_28": 31, "run_20210712221415_instance_8_cores_32": 31, "run_20210712221415_instance_9_cores_36": 31, "run_20210712221615_inst": 31, "run_20210712221615_instance_0_cores_11": 31, "run_20210712223308_inst": 31, "run_20210712223308_instance_0_cores_0": 31, "run_20210713152500_instance_0_cores_0": 31, "run_20210713153048_instance_0_cores_0": 31, "run_20210713153333_instance_0_cores_0": 31, "run_20210713153659_instance_0_cores_0": 31, "run_20220106130151_instance_0_cores_0": 31, "run_benchmark": [26, 34], "run_qa": [10, 34], "runner": 5, "running_mod": 34, "runtim": [1, 8, 13, 17, 31, 33, 34], "runtimeerror": [26, 34], "s1": 20, "s7": 34, "s8": 34, "sacrif": 8, "sai": 5, "salesforc": 28, "same": [2, 5, 7, 10, 15, 16, 17, 18, 20, 21, 28, 31, 32, 33, 34], "same_model_execution_again": 34, "sampl": [2, 6, 9, 14, 16, 17, 29, 33], "sample_input": [2, 9, 34], 
"sample_text_captum_input": 32, "sampler": 6, "sampling_s": [2, 4, 16, 34], "sapphir": 3, "satisfi": [15, 26], "satur": 34, "save": [2, 5, 6, 7, 13, 14, 15, 16, 18, 21, 28, 32, 34], "save_qconf_summari": [6, 15, 16, 29], "scalabl": [3, 7, 21, 28, 30, 33, 34], "scalar": 2, "scalartyp": 17, "scalartypetocpptyp": 17, "scale": [2, 3, 6, 15, 28], "scale_attn": 2, "scaled_dot_product_attent": 2, "scatter": 31, "scenario": [2, 6, 7, 18, 33, 34], "schedul": [1, 2, 13, 20, 31, 33], "scheme": 32, "scope": [2, 7, 8, 21, 34], "script": [1, 2, 3, 4, 5, 6, 7, 8, 10, 14, 17, 20, 23, 24, 26, 28, 29, 30, 32, 33, 34], "scriptmodul": [2, 13, 20], "sdk": 34, "search": [1, 2, 4, 5, 7, 16, 22, 28, 31], "sec": 30, "second": [2, 10, 28, 32, 33], "secondli": 28, "secret": 18, "section": [1, 6, 7, 8, 14, 20, 23, 24, 25, 28, 29, 32, 33, 34], "secur": 3, "see": [1, 2, 5, 8, 14, 34], "seed": 2, "seen": 28, "segment": 34, "select": [2, 5, 7, 13, 24, 34], "self": [2, 6, 8, 10, 16, 20, 26, 34], "selu": 34, "semant": 18, "sens": 21, "sep": [3, 17], "separ": [7, 19, 27, 33], "seq_classification_artifact": 32, "seq_info": 2, "seq_len": [2, 30], "seq_length": [6, 11, 32], "seqlen_k": 2, "seqlen_q": 2, "sequenc": [2, 18, 21, 28, 34], "sequenti": 16, "seri": 33, "serv": [20, 34], "server": [32, 33], "servic": [6, 28, 30, 33], "session": 30, "set": [1, 2, 4, 5, 6, 7, 8, 14, 15, 16, 17, 21, 24, 26, 28, 30, 31, 32, 33, 34], "set_flush_denorm": 33, "set_format": 6, "set_glob": 6, "set_num_thread": [26, 34], "set_properti": 6, "sete": 15, "settensorexprfuseren": 26, "setup": [5, 6, 28, 34], "setup_config": 32, "setup_lint": 5, "sever": [2, 7, 10, 19, 30, 31, 34], "sgd": [2, 6, 7, 8, 16, 19], "sgemm": 34, "sha": 17, "shall": [5, 18, 33], "shape": [2, 6, 7, 16, 20, 23, 30, 33, 34], "shard": 28, "share": [1, 5, 6, 16, 20, 32, 33, 34], "share_weight_observ": 2, "shared_criterion": [16, 22], "sheet": 23, "shift": 21, "ship": 28, "short_factor": 2, "shortcut": 34, "shorten": 5, "shorter": [21, 28], "should": [2, 
5, 8, 15, 20, 28, 31, 33], "show": [8, 17, 21, 28, 29, 30, 31, 32, 33, 34], "shown": [1, 6, 18, 28, 31, 32], "shuffl": 6, "shufflenet": 30, "shufflenetv2_x1": 30, "side": [15, 33], "sigmoid": [13, 34], "sign": 21, "signficantli": 32, "signific": 21, "significantli": [28, 34], "silu": [2, 13], "similar": [15, 17, 33], "similarli": 32, "simpl": [5, 7, 8, 11, 18, 33, 34], "simplenet": [8, 34], "simpli": [6, 7, 26, 31], "simplifi": [10, 34], "simultan": 20, "sin": 2, "sinc": [6, 7, 18, 19, 20, 21, 26, 33, 34], "sincer": 34, "singl": [2, 7, 13, 14, 16, 19, 20, 30, 32, 34], "single_query_cached_kv_attent": 2, "site": 32, "situat": [7, 14], "six": 33, "size": [2, 6, 7, 11, 15, 16, 17, 18, 23, 26, 28, 30, 32, 33, 34], "sizeof": 17, "skip": [5, 6, 17, 18], "skip_special_token": [6, 23], "skylak": 15, "sleef": 17, "sleep": 33, "slice": [6, 18], "sliu": 34, "slope": 2, "slot": [2, 30], "slot_map": 2, "slow": 34, "slower": [8, 33, 34], "small": [7, 19, 33, 34], "smaller": [8, 17], "smooth": 7, "smooth_l1_loss": 8, "smoothquant": [2, 6, 7, 16, 22, 28, 34], "smoothquant_arg": [2, 16], "snc": 34, "snippet": [10, 29], "so": [2, 5, 6, 7, 8, 15, 17, 18, 20, 30, 31, 32, 33, 34], "sock": 32, "socket": [14, 30, 32, 33, 34], "soft_margin_loss": 8, "softmax": [2, 13, 34], "softmax_scal": 2, "softwar": [3, 27, 34], "sole": 33, "solut": [2, 7, 26, 28, 34], "solv": [7, 19, 33], "some": [2, 5, 7, 8, 13, 16, 17, 18, 20, 26, 28, 31, 32, 33, 34], "someth": 18, "sometim": [31, 33], "sophist": 33, "sourc": [1, 5, 6, 17, 27, 28, 33, 34], "space": [2, 7, 16, 18, 22, 33], "spars": [7, 18, 34], "sparsiti": 2, "spawn": [7, 20], "special": [17, 18, 28], "specif": [1, 2, 5, 6, 7, 12, 18, 20, 26, 28, 31, 33, 34], "specifi": [2, 5, 6, 14, 20, 31, 33, 34], "specifii": 17, "spectr": 30, "speech": [3, 33], "speed": [2, 7, 11, 19, 28, 33, 34], "speedup": [2, 6, 8, 28, 30, 34], "sphinx": 5, "split": [2, 6, 7, 16, 17, 19, 20, 26, 34], "split_bf16_from_fp32": 21, "split_master_weight_for_bf16": 2, "splitsgd": 
[7, 21], "spontan": 18, "sqrt": [2, 13, 19], "squad": [10, 30, 34], "squar": [13, 28], "squenc": 2, "src": [2, 17], "src_data_ptr": 18, "src_md": 18, "src_mem": 18, "ssd": [30, 34], "sse": 17, "sse2": 17, "sse3": 17, "sse4_1": 17, "sse4_2": 17, "ssse3": 17, "stabil": [2, 8, 34], "stabilityai": 28, "stabl": [2, 3, 8, 34], "stablelm": [2, 28], "stack": [6, 8], "stage": [7, 10, 19, 20, 29, 33, 34], "stakehold": 34, "stall": 33, "standard": [1, 34], "stanford": 34, "starcod": [28, 34], "start": [1, 3, 4, 5, 6, 7, 10, 20, 24, 34], "start_dim": 20, "state": [2, 15, 19, 28], "state_dict": [2, 6, 34], "state_sum": 19, "state_sum_i": 19, "state_sum_n": 19, "statement": [14, 17], "static": [2, 4, 16, 26, 28, 31, 32, 33, 34], "static_quantized_model": 6, "staticquantizationint8": 28, "statist": 7, "statu": 17, "std": [6, 17, 19], "stdio": 5, "stdout": 31, "stead": 17, "steam": [20, 34], "step": [2, 5, 6, 7, 8, 14, 16, 19, 21, 32], "step_siz": [16, 22], "stft": 8, "stick": 7, "still": [2, 5, 7, 8, 13, 16, 18, 21, 26, 34], "stock": [13, 30, 34], "stop": [2, 33], "storag": 19, "store": [2, 17, 18, 19, 21, 28, 31, 32, 33, 34], "store_tru": [6, 23], "str": [2, 6, 14, 23, 31], "straight": [13, 33], "straightforward": 34, "strategi": [14, 31, 33, 34], "stream": [2, 7, 20, 34], "streamlin": 34, "strict": [6, 32], "stride": [8, 10, 20, 34], "stride_c": 18, "stride_h": 18, "stride_n": 18, "stride_w": 18, "string": [2, 31], "structur": [1, 18, 31, 34], "style": [2, 5], "sub": [20, 28, 33], "subfold": 17, "subgraph": 2, "subject": [7, 17, 20, 27, 34], "submit": [1, 5, 7, 20], "submodul": 5, "subsequ": [18, 33], "substr": 5, "success": [10, 24], "suffer": 20, "suffix": 17, "suggest": [1, 2, 15, 18, 20, 33, 34], "suit": 5, "sum": [13, 16, 18, 19, 34], "summar": 26, "summari": [6, 34], "super": [8, 10, 16, 20, 26, 34], "superset": 20, "suppli": 8, "support": [2, 5, 6, 7, 13, 15, 16, 17, 18, 19, 20, 21, 25, 26, 28, 29, 31, 32, 33, 34], "suppos": [2, 6, 14, 33], "sure": [5, 14, 15, 32, 33], 
"svd": 8, "sw": 30, "swish": 34, "switch": [7, 17, 31, 33, 34], "sy": 30, "sycl": 1, "symbol": 20, "symeig": 8, "symlink": 5, "symmetr": [2, 15, 34], "sync": [5, 20], "synchron": [20, 26, 34], "sysctl": 33, "system": [17, 33], "systemat": 7, "t": [2, 5, 7, 8, 14, 15, 16, 17, 18, 20, 26, 32, 34], "t5": [2, 26, 28, 34], "t_valu": 17, "tab": 5, "tabl": [2, 7, 17, 28, 30, 34], "tackl": 7, "tacotron2": 34, "take": [1, 2, 7, 8, 10, 12, 13, 14, 18, 21, 25, 26, 30, 31, 33], "taken": 32, "tanh": [13, 34], "target": [5, 6, 10, 13, 14, 17, 34], "target_link_librari": 6, "target_v": 14, "task": [2, 7, 28, 31, 33, 34], "task1": 20, "task2": 20, "taskset": 31, "tbd": 26, "tc": 14, "tcmalloc": 32, "te": 34, "team": [1, 5], "techniqu": [1, 2, 7, 11, 12, 28, 34], "technolog": [1, 7, 28], "technologi": [3, 7], "tee": 31, "tell": [18, 20, 31, 33], "temperatur": [6, 23], "tenosr": 2, "tensor": [2, 6, 7, 8, 11, 15, 16, 17, 20, 26, 28, 32, 34], "tensorexpr_fus": 26, "tensorflow": 18, "tensoriter": 18, "terabyt": 30, "term": 27, "termin": 14, "test": [7, 16, 17, 30, 34], "test_": 5, "test_alias_analysi": 5, "test_bceloss": 5, "test_data": 16, "test_dataload": 16, "test_jit": 5, "test_mseloss": 5, "test_nn": 5, "test_sequenti": 5, "testclassnam": 5, "testjit": 5, "testnam": 5, "testnn": 5, "testsuit": 5, "text": [3, 6, 26, 28, 30, 33, 34], "text_max_length": 2, "tgi": 34, "than": [2, 5, 7, 17, 18, 20, 21, 26, 31, 33, 34], "thank": [5, 34], "thei": [7, 8, 31, 33], "them": [1, 5, 7, 18, 19, 28, 31, 33], "themselv": [31, 34], "therefor": 33, "thi": [2, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 26, 27, 28, 29, 30, 31, 34], "thing": [14, 33], "third": [19, 34], "those": [2, 15, 33], "though": [2, 7], "thrash": 33, "threa": 34, "thread": [1, 2, 7, 20, 26, 30, 31, 32, 33, 34], "three": [7, 16, 17], "threshold": 33, "through": [1, 2, 6, 7, 8, 12, 25, 28, 33, 34], "throughput": [2, 3, 18, 20, 26, 28, 30, 34], "thu": [2, 7, 8, 10, 18, 20, 21, 28, 31, 32, 33], "thudm": 28, "tidi": 
5, "tightli": 34, "tiiuae": 28, "tile": 17, "time": [2, 5, 7, 14, 16, 17, 18, 19, 26, 28, 30, 33, 34], "timeout": [2, 5, 21], "timestamp": [2, 28], "tip": 17, "tmp": [10, 32, 34], "to_bfloat16_train": 7, "to_dens": 18, "to_mkldnn": 18, "togeth": [7, 14, 20, 33, 34], "toggl": 7, "token": [2, 6, 23, 28, 30], "tokenize_funct": 6, "tolist": 16, "tool": [17, 33, 34], "toolset": 17, "top": [10, 21, 34], "top1": 30, "toplevel": 5, "topologi": [7, 18, 19, 26, 30, 31, 33, 34], "torch": [1, 2, 4, 6, 8, 10, 11, 12, 13, 15, 16, 18, 20, 23, 26, 29, 32, 33, 34], "torch_ccl": 6, "torch_check": 17, "torch_dtyp": [6, 23], "torch_ipex": [17, 34], "torch_ipex_librari": 6, "torchconfig": 6, "torchdynamo": [1, 7, 12, 23, 34], "torchrun": 34, "torchscirpt": 2, "torchscript": [1, 2, 5, 7, 10, 11, 12, 19, 23, 26, 32, 34], "torchserv": [3, 34], "torchvis": [6, 10, 12, 13, 16, 18, 32, 34], "torchvison": 34, "total": [2, 6, 30, 33], "total_new_token": [6, 23], "totensor": [6, 13, 16], "tpp": 2, "trace": [1, 6, 7, 8, 12, 13, 15, 16, 20, 23, 26, 32, 34], "trace_model": 34, "traced_model": [6, 10, 13, 15, 16, 26, 34], "traced_model1": 20, "traced_model2": 20, "track": 1, "track_running_stat": 10, "trade": [8, 28, 30, 34], "tradeoff": 15, "trail": [5, 21], "train": [2, 3, 4, 7, 11, 13, 15, 16, 18, 21, 23, 26, 28, 31, 34], "train_dataload": 16, "train_dataset": [6, 13], "train_load": [6, 8], "training_data": 16, "transfer": 33, "transform": [2, 3, 4, 6, 10, 11, 13, 16, 18, 22, 23, 28, 29, 32, 33, 34], "transformer_handler_gener": 32, "transformerencoderlay": 26, "transnetv2": 34, "transpar": [2, 7, 29, 33, 34], "transpos": [13, 34], "tree": [5, 6], "tri": 12, "trial": 14, "triangular_solv": 8, "trigger": 12, "triplet_margin_loss": 8, "true": [2, 4, 6, 10, 12, 13, 14, 15, 16, 17, 22, 23, 31, 32, 33, 34], "trust_remote_cod": [6, 23], "truth": 21, "try": [2, 5, 6, 7, 12, 14, 16, 26, 31, 33, 34], "tunabl": [30, 32], "tune": [2, 3, 4, 7, 8, 15, 20, 26, 28, 29, 31, 32, 34], "tuned_conf": 16, 
"tuned_model": [4, 16, 34], "tunin": 32, "tuning_tim": [2, 4, 16, 34], "tupl": [2, 6, 17, 20], "turboboost": 30, "turn": [7, 34], "tutori": [5, 6, 15, 16, 29, 34], "two": [2, 7, 14, 16, 20, 21, 28, 32, 33, 34], "txt": [5, 6, 32], "type": [2, 4, 5, 6, 7, 10, 16, 17, 18, 20, 21, 23, 30, 31, 32, 34], "types": 17, "typic": [6, 10, 28, 33, 34], "u": [30, 32], "u7": 34, "u8": 34, "ubuntu": 30, "ucod": 30, "uint32_t": 17, "ultra": 33, "uma": 33, "unabl": 10, "unalign": [17, 34], "uncas": [4, 6, 10, 11, 32, 34], "undefin": [2, 20, 33], "under": [2, 6, 8, 18, 20, 27, 31, 34], "undergo": 26, "underhood": 34, "underli": [1, 17, 28], "underneath": 34, "understand": [21, 28, 33], "undesir": 31, "unexpect": 2, "unifi": [2, 31], "uniform": 32, "uninstal": 5, "union": 2, "unit": [1, 2, 33], "unittest": 5, "unix": 32, "unlik": 6, "unlist": 8, "unnecessari": 33, "unpack": [26, 34], "unpad": 2, "unpredict": 2, "unrel": 6, "unsign": 34, "unsqueez": 2, "unstabl": 8, "until": [5, 20, 21, 33], "untrack": 5, "unus": [31, 33], "unutil": 32, "up": [2, 3, 7, 11, 20, 24, 28, 33, 34], "updat": [2, 5, 7, 16, 19, 21, 22, 34], "upgrad": 34, "upi": 33, "upload": 34, "upper": [18, 33], "upsampl": [18, 34], "upstream": [7, 18, 34], "url": [32, 34], "us": [1, 2, 3, 4, 5, 6, 11, 14, 15, 17, 18, 19, 21, 23, 24, 25, 26, 27, 28, 32, 33, 34], "usabl": 34, "usag": [2, 6, 7, 8, 23, 25, 32, 33, 34], "use_all_nod": 14, "use_default_alloc": [32, 34], "use_logical_cor": [14, 32], "user": [1, 2, 7, 9, 10, 12, 13, 15, 16, 18, 20, 26, 31, 32, 33, 34], "user_model": [6, 15], "usr": [6, 17, 31, 32], "usual": [2, 18, 20, 33], "usuali": 33, "usus": 32, "ut": 31, "util": [1, 6, 7, 10, 13, 15, 16, 18, 21, 28, 31, 33, 34], "ux": 34, "v": 5, "v0": [28, 34], "v1": [28, 34], "v2": [28, 30, 34], "v3": 34, "valid": [2, 21, 34], "valu": [2, 6, 10, 14, 16, 17, 19, 20, 21, 22, 26, 28, 31, 32, 33, 34], "value_cach": 2, "value_token": 2, "var": 29, "vari": 16, "variabl": [2, 5, 17, 30, 31, 32, 33, 34], "varianc": 34, 
"variance_epsilon": 2, "variant": [2, 8, 28, 34], "variou": [6, 7, 14, 28, 33, 34], "varlen_attent": [2, 34], "varlenattent": [2, 34], "varlenattention_modul": 2, "ve": 34, "vec256": 17, "vec512": 17, "vec_bia": 17, "vector": [1, 2, 6, 17, 18, 25, 28], "vectors": 17, "verbos": [2, 4, 31], "verbose_off": 2, "verbose_on": 2, "verbose_on_cr": 2, "veri": [2, 5, 15, 18, 28], "verifi": [6, 7], "version": [6, 7, 16, 17, 25, 26, 27, 32, 33, 34], "vgg": 30, "vgg11": 30, "via": [2, 5, 6, 7, 18, 20, 30, 31, 33, 34], "video": 7, "view": [13, 18, 20, 21], "view_as_complex": 8, "virtual": 17, "virtual_env": [31, 32], "vision": [3, 6, 28, 30, 34], "visit": [7, 33], "vllm": [2, 34], "vm": 34, "vnni": [1, 15, 17, 25, 28], "vocab_s": [6, 11, 32], "voic": 33, "void": 17, "vstack": 6, "w": [7, 16, 18, 21, 30, 32], "wa": [7, 31, 32, 33, 34], "wai": [5, 10, 16, 18, 28, 34], "wait": [20, 33], "wake": 20, "walk": 34, "want": [2, 5, 7, 14, 15, 17, 20, 31, 34], "warm": 33, "warn": [5, 6, 12, 31, 32, 34], "wav2vec2": 33, "wave2vec": 34, "wc": 18, "we": [1, 2, 5, 6, 7, 8, 9, 10, 14, 15, 16, 17, 18, 19, 20, 21, 23, 28, 30, 32, 33, 34], "web": 28, "webpag": 34, "websit": 7, "wei_ic_observ": 2, "wei_observ": 2, "weight": [1, 2, 7, 10, 12, 13, 15, 16, 18, 20, 22, 23, 26, 28, 34], "weight_dacai": 21, "weight_decai": [7, 19], "weight_dtyp": [2, 6, 29], "weight_qschem": 2, "weights_prepack": [2, 6, 7, 23, 26], "well": [1, 2, 5, 6, 7, 11, 16, 20, 21, 24, 28, 32, 33, 34], "were": [30, 31, 32, 33], "west": 30, "what": [3, 5, 6, 8, 23], "wheel": 34, "when": [2, 5, 6, 7, 8, 9, 14, 18, 19, 20, 21, 22, 25, 26, 28, 30, 31, 32, 33, 34], "where": [2, 5, 7, 16, 21, 33, 34], "wherea": 30, "whether": [2, 6, 8, 16, 18, 22, 23, 33], "which": [1, 2, 5, 7, 8, 10, 14, 15, 16, 17, 18, 20, 26, 28, 30, 31, 32, 33, 34], "while": [2, 7, 8, 11, 12, 18, 21, 26, 28, 31, 32, 33, 34], "whisper": [2, 28, 34], "whl": 34, "who": 10, "whole": [19, 20, 33], "wide": [21, 34], "wider": 1, "widespread": [1, 7, 28], "width": [17, 18], 
"wikipedia": [13, 33], "wise": [2, 16, 19, 22, 29, 34], "wish": [5, 7], "with_arg": [2, 6, 15], "within": [5, 16, 21, 29, 33, 34], "without": [2, 5, 6, 7, 8, 10, 16, 20, 21, 26, 32, 34], "wlydcrb1": 30, "wn": 18, "won": [2, 7, 8, 17, 26], "woq": [2, 28], "woqactquantmod": 2, "woqlowpmod": [2, 6, 29], "woqweightdtyp": [2, 6, 29], "woqweightqschem": 2, "work": [2, 5, 6, 7, 14, 15, 17, 20, 26, 28, 29, 31, 33, 34], "workabl": 2, "workaround": [26, 34], "worker": [20, 31], "workflow": 34, "workload": [1, 6, 7, 8, 10, 11, 12, 21, 26, 28, 29, 30, 31, 33, 34], "workload1": 30, "workspac": 6, "world": [5, 7], "world_siz": [6, 29], "worri": 32, "wors": 2, "worth": 34, "would": [2, 5, 6, 14, 16, 17, 18, 30, 31, 32, 33, 34], "wrap": 34, "write": [7, 17], "written": [5, 6, 17], "x": [1, 2, 5, 6, 8, 10, 13, 15, 16, 17, 18, 20, 21, 23, 26, 34], "x1": 20, "x2": 20, "x86": 3, "x86_64": 30, "xcr0": 17, "xdf": 5, "xe": 33, "xeon": [3, 7, 14, 21, 28, 30, 32, 33, 34], "xl": 28, "xlm": 26, "xmx": 1, "xpu": [1, 2, 3, 34], "xsave": 17, "xx": 6, "xx_c": 34, "xx_v": 34, "y": [8, 15, 16, 20, 21, 34], "y1": 20, "y1_futur": 20, "y2": 20, "y2_futur": 20, "y_runtim": 20, "yaml": 14, "ye": 5, "year": 28, "yet": [2, 6, 26, 34], "yield": [1, 7, 33], "yolov3": 34, "you": [1, 2, 5, 6, 7, 8, 13, 14, 15, 17, 18, 20, 23, 25, 26, 28, 29, 31, 33, 34], "your": [1, 5, 6, 7, 8, 10, 14, 15, 20, 23, 24, 26, 27, 28, 29, 34], "your_calibration_dataset": 29, "your_conf_fil": [4, 34], "your_generation_param": 34, "your_python_script": [4, 34], "your_pytorch_script": [4, 31], "yuan": [2, 28], "yuan2": 28, "z11pa": 33, "zero": [6, 15, 34], "zero_grad": [6, 7, 16], "zero_tensor": 2, "zip": [6, 23, 34], "zone": [30, 34], "zoo": [6, 30], "\u03b1": 21}, "titles": ["Intel\u00ae Extension for PyTorch* CPU ISA Dynamic Dispatch Design Doc", "Intel\u00ae Extension for PyTorch*", "API Documentation", "Blogs & Publications", "Cheat Sheet", "Contribution", "Examples", "Features", "Auto Mixed Precision (AMP)", "Auto Channels 
Last", "Codeless Optimization (Prototype)", "Fast BERT (Prototype)", "Graph Capture (Prototype)", "Graph Optimization", "HyperTune (Prototype)", "Intel\u00ae Extension for PyTorch* optimizations for quantization", "INT8 Recipe Tuning API (Prototype)", "ISA Dynamic Dispatching", "Channels Last", "Optimizer Fusion", "Runtime Extension", "Split SGD", "Smooth Quant Recipe Tuning API (Prototype)", "Quick Start", "Installation", "Introduction", "Troubleshooting", "License", "Large Language Models (LLM) Optimization Overview", "LLM Optimizations Frontend API", "Performance", "Launch Script Usage Guide", "TorchServe with Intel\u00ae Extension for PyTorch*", "Performance Tuning Guide", "Releases"], "titleterms": {"": 34, "0": [6, 7, 34], "1": [7, 14, 32, 34], "10": [30, 34], "100": 34, "11": [30, 34], "12": 34, "13": [7, 34], "2": [6, 7, 14, 32, 34], "200": [30, 34], "2xlarg": 30, "3": [32, 34], "300": 34, "4": [32, 34], "5": 34, "8": 34, "9": 34, "That": 18, "The": 10, "__call__": 10, "access": [28, 33], "accuraci": 30, "add": 17, "ai": [6, 30], "algorithm": 16, "all": [18, 31], "alloc": [31, 33], "alpha": [16, 34], "alreadi": 10, "amp": [7, 8], "an": 30, "api": [2, 7, 9, 13, 16, 17, 18, 22, 25, 28, 29], "appli": 10, "architectur": 1, "archiv": 32, "asynchron": 20, "aten": [17, 18], "attr": 10, "auto": [7, 8, 9, 16, 20], "autocast": 8, "autotun": 16, "aw": 30, "b": 18, "basic": 20, "behavior": 8, "benchmark": 32, "bert": [2, 6, 7, 11, 32], "beta": [6, 7], "better": 5, "bf16": [6, 10, 13, 29], "bfloat16": [6, 8, 21, 26, 30], "bind": 20, "block": 18, "blog": 3, "boost": 32, "build": [5, 17], "c": [5, 6, 18], "c6i": 30, "cach": [28, 33], "calibr": [6, 15], "can": 8, "captur": [7, 12], "case": [8, 10, 20], "center": 30, "chang": 34, "channel": [7, 9, 18, 33], "cheat": 4, "check": 17, "code": 17, "codegen": 17, "codeless": [7, 10], "command": 10, "common": 29, "compil": [7, 17], "configur": [20, 30, 33], "content": [32, 33], "contribut": 5, "convers": 18, "convert": 15, 
"convolut": 18, "core": [20, 31, 32], "correct": 26, "coverag": 18, "cpp": 17, "cpu": [0, 2, 17, 18, 33], "creat": [18, 32], "creation": 18, "csrc": 17, "custom": [17, 28], "d": 18, "data": [28, 30], "debug": [5, 17], "deepspe": [28, 29], "default": [8, 9, 14, 18, 31], "defin": [14, 15], "demo": 28, "denorm": 33, "deploi": [15, 32], "deploy": 6, "descent": 21, "descript": [11, 12], "design": [0, 17, 20, 31], "detail": 20, "determin": 16, "develop": 5, "disabl": 9, "dispatch": [0, 7, 17], "dispatchstub": 17, "distribut": [6, 28, 29], "do": 15, "doc": 0, "document": [2, 5, 25, 32, 33], "dure": 20, "dynam": [0, 6, 7, 15, 17, 26], "dyndisp": 17, "eager": [6, 8], "eas": [9, 13], "easi": 7, "ec2": 30, "elig": 8, "enabl": 9, "exampl": [6, 10, 11, 12, 14, 16, 17, 20, 31], "examples1": 20, "examples2": 20, "examples3": 20, "explicitli": 10, "export": 32, "extens": [0, 1, 5, 7, 15, 20, 26, 32], "fast": [2, 6, 7, 11], "featur": [6, 7, 11, 12, 17], "file": 32, "fix": 16, "float32": [6, 8], "fold": 13, "folder": 17, "format": 18, "forward": 10, "fp32": [6, 10, 13, 29, 30], "from": [6, 7], "frontend": 29, "fusion": [13, 19], "gener": [2, 26], "get": 25, "gnu": [31, 33], "gradient": 21, "graph": [2, 7, 12, 13, 28], "guid": [31, 33], "h": 17, "hardwar": [30, 33], "highlight": 34, "how": 20, "huggingfac": 10, "hyperparamet": 14, "hypertun": [7, 14], "i": [18, 20, 31], "ii": 31, "iii": 31, "implement": [17, 20], "improv": 34, "includ": 31, "index": 31, "indirect": 28, "infer": [6, 8, 28, 29, 31, 32], "input": [8, 20], "instal": [24, 32], "instanc": [28, 30, 31], "instead": 10, "int4": 6, "int8": [6, 7, 13, 16, 26, 30, 32], "intel": [0, 1, 5, 6, 15, 30, 31, 32, 33], "intrin": 17, "introduct": [8, 19, 25], "iomp": 20, "ipex": [10, 28], "isa": [0, 7, 17], "issu": [9, 20, 34], "iv": 31, "jemalloc": [31, 33], "jit": 10, "kernel": [17, 18], "known": [9, 20, 34], "kv": 28, "languag": [6, 7, 28], "larg": [6, 7, 28], "last": [7, 9, 18, 33], "latenc": 31, "launch": [10, 31], "launcher": [14, 
32], "layout": 18, "level": [2, 17, 28], "librari": 31, "licens": 27, "linear": 28, "lint": 5, "list": 28, "llm": [2, 6, 7, 23, 28, 29, 30], "load": 20, "local": 5, "logic": 31, "low": 28, "manner": 18, "manual": 17, "matter": 18, "memori": [18, 31, 33], "method": 10, "methodologi": [13, 28], "mix": [7, 8], "mode": [6, 28, 31], "model": [6, 7, 13, 15, 18, 20, 28, 32], "modul": [2, 10, 20, 28], "motiv": 10, "multi": 32, "multipl": 31, "multistream": 20, "nativ": 18, "nchw": 18, "nchw16c": 18, "new": [6, 7, 34], "nhwc": 18, "node": 31, "non": 33, "note": 34, "numa": 33, "numactl": 33, "number": [30, 31, 33], "omp_num_thread": 33, "omp_thread_limit": 33, "onednn": [18, 33], "onli": [6, 29], "op": 8, "openmp": [31, 33], "oper": [7, 18, 19, 28], "optim": [2, 7, 10, 13, 15, 19, 28, 29], "origin": 10, "other": 34, "output": 20, "overview": [17, 28, 30, 31, 33], "path": 8, "pattern": 13, "perform": [20, 26, 30, 32, 33, 34], "physic": 31, "pin": 32, "precis": [7, 8, 28], "preload": 20, "prepar": 15, "prerequisit": 11, "primit": [18, 33], "privat": 17, "process": 17, "product": 30, "promot": 8, "prototyp": [2, 6, 7, 10, 11, 12, 14, 16, 22, 28], "pseudocod": 29, "public": 3, "pytest": 5, "python": [5, 6, 7], "pytorch": [0, 1, 5, 15, 18, 32], "qconfig": 15, "quant": 22, "quantiz": [2, 6, 7, 15, 16, 29], "quick": 23, "recip": [16, 20, 22], "refer": [6, 8], "regist": [18, 32], "regress": 26, "releas": 34, "requir": [17, 20], "resnet50": [6, 32], "result": [26, 34], "runtim": [2, 7, 20, 26], "scale": 32, "scenario": 29, "script": 31, "search": 14, "select": 17, "serial": 32, "serv": 32, "set": 20, "sgd": 21, "shape": 26, "sheet": 4, "singl": [28, 31], "smooth": [6, 16, 22], "smoothquant": 29, "softwar": [30, 33], "space": 14, "specif": [8, 17], "split": 21, "start": [23, 25, 32], "static": [6, 15], "statu": 18, "stochast": 21, "stride": 18, "struct": 17, "structur": [20, 33], "stub": 17, "support": [1, 8, 10], "target": 18, "task": 20, "tcmalloc": [31, 33], "tensor": 18, "test": 
5, "thi": [32, 33], "through": 16, "throughput": 31, "tip": 5, "torch": 7, "torchdynamo": [6, 26], "torchscript": [6, 8], "torchserv": 32, "trace": 10, "train": [6, 8], "troubleshoot": 26, "tune": [14, 16, 22, 33], "type": [8, 28], "uniform": 33, "unit": 5, "us": [7, 8, 9, 10, 13, 16, 20, 31], "usag": [10, 11, 12, 14, 16, 20, 26, 29, 31], "user": 14, "v": 31, "v1": 30, "vec": 17, "verifi": 28, "version": 30, "vi": 31, "via": 28, "vii": 31, "viii": 31, "weight": [6, 29], "what": [18, 34], "widest": 8, "wip": 18, "woq": 29, "worker": 32, "write": [5, 18], "xyz": 17, "xyzkrnl": 17, "your": 31, "your_conf_fil": 14, "your_python_script": 14}})
\ No newline at end of file
diff --git a/cpu/2.5.0+cpu/tutorials/api_doc.html b/cpu/2.5.0+cpu/tutorials/api_doc.html
index 2f56f6335..81944c4d8 100644
--- a/cpu/2.5.0+cpu/tutorials/api_doc.html
+++ b/cpu/2.5.0+cpu/tutorials/api_doc.html
@@ -1751,7 +1751,7 @@ Graph Optimization using a
theme
provided by Read the Docs.
-
+
© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD),
http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/blogs_publications.html b/cpu/2.5.0+cpu/tutorials/blogs_publications.html
index 099e42d92..c20682bfd 100644
--- a/cpu/2.5.0+cpu/tutorials/blogs_publications.html
+++ b/cpu/2.5.0+cpu/tutorials/blogs_publications.html
@@ -167,7 +167,7 @@ Blogs & Publications using a
theme
provided by Read the Docs.
-
+
© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD),
http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/cheat_sheet.html b/cpu/2.5.0+cpu/tutorials/cheat_sheet.html
index 3551797d4..24b2ee3bf 100644
--- a/cpu/2.5.0+cpu/tutorials/cheat_sheet.html
+++ b/cpu/2.5.0+cpu/tutorials/cheat_sheet.html
@@ -195,7 +195,7 @@ Cheat Sheet using a
theme
provided by Read the Docs.
-
+
© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD),
http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/contribution.html b/cpu/2.5.0+cpu/tutorials/contribution.html
index 9c4077718..9c1b5789e 100644
--- a/cpu/2.5.0+cpu/tutorials/contribution.html
+++ b/cpu/2.5.0+cpu/tutorials/contribution.html
@@ -331,7 +331,7 @@ Tips<
Built with Sphinx using a
theme
provided by Read the Docs.
-
+
© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD),
http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/examples.html b/cpu/2.5.0+cpu/tutorials/examples.html
index cf225776b..8f595e425 100644
--- a/cpu/2.5.0+cpu/tutorials/examples.html
+++ b/cpu/2.5.0+cpu/tutorials/examples.html
@@ -636,7 +636,7 @@ Fast Bert (Prototype)
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
LLAMA |
@@ -183,16 +183,16 @@ Verified for single instance mode🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
LLAMA |
meta-llama/Meta-Llama-3-70B |
🟩 |
🟩 |
- 🟨 |
+ 🟩 |
🟩 |
🟩 |
@@ -201,10 +201,28 @@ Verified for single instance mode🟩 |
- 🟨 |
+ 🟩 |
🟩 |
🟩 |
+
+ LLAMA |
+ meta-llama/Llama-3.2-3B-Instruct |
+ 🟩 |
+ 🟩 |
+ 🟩 |
+ 🟩 |
+ 🟩 |
+
+
+ LLAMA |
+ meta-llama/Llama-3.2-11B-Vision-Instruct |
+ 🟩 |
+ 🟩 |
+ |
+ 🟩 |
+ |
+
GPT-J |
EleutherAI/gpt-j-6b |
@@ -218,19 +236,19 @@ Verified for single instance mode🟨 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
+ 🟩 |
DOLLY |
databricks/dolly-v2-12b |
🟩 |
- 🟨 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
+ 🟩 |
FALCON |
@@ -239,7 +257,7 @@ Verified for single instance mode🟩 |
🟩 |
- |
+ 🟩 |
FALCON |
@@ -248,7 +266,7 @@ Verified for single instance mode🟩 |
🟩 |
- 🟨 |
+ 🟩 |
FALCON |
@@ -266,7 +284,7 @@ Verified for single instance mode🟩 |
🟩 |
- 🟨 |
+ 🟩 |
OPT |
@@ -275,16 +293,16 @@ Verified for single instance mode🟩 |
🟩 |
- 🟨 |
+ 🟩 |
Bloom |
bigscience/bloom-1b7 |
🟩 |
- 🟨 |
🟩 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
CodeGen |
@@ -302,34 +320,34 @@ Verified for single instance mode🟩 |
🟩 |
- 🟨 |
+ 🟩 |
Baichuan |
baichuan-inc/Baichuan2-13B-Chat |
🟩 |
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
Baichuan |
baichuan-inc/Baichuan-13B-Chat |
🟩 |
- 🟨 |
🟩 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
ChatGLM |
THUDM/chatglm3-6b |
🟩 |
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
ChatGLM |
@@ -338,23 +356,23 @@ Verified for single instance mode🟩 |
🟩 |
- 🟨 |
+ 🟩 |
GPTBigCode |
bigcode/starcoder |
🟩 |
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
T5 |
google/flan-t5-xl |
🟩 |
🟩 |
- 🟨 |
+ 🟩 |
🟩 |
|
@@ -372,9 +390,9 @@ Verified for single instance mode🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
Mixtral |
@@ -383,34 +401,34 @@ Verified for single instance mode |
🟩 |
- 🟨 |
+ 🟩 |
Stablelm |
stabilityai/stablelm-2-1_6b |
🟩 |
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
Qwen |
Qwen/Qwen-7B-Chat |
🟩 |
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
Qwen |
Qwen/Qwen2-7B |
🟩 |
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
LLaVA |
@@ -436,7 +454,7 @@ Verified for single instance mode🟩 |
|
- 🟨 |
+ 🟩 |
|
@@ -446,43 +464,43 @@ Verified for single instance mode🟩 |
🟩 |
- 🟨 |
+ 🟩 |
Phi |
microsoft/Phi-3-mini-4k-instruct |
🟩 |
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
Phi |
microsoft/Phi-3-mini-128k-instruct |
🟩 |
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
Phi |
microsoft/Phi-3-medium-4k-instruct |
🟩 |
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
Phi |
microsoft/Phi-3-medium-128k-instruct |
🟩 |
🟩 |
- 🟨 |
🟩 |
- 🟨 |
+ 🟩 |
+ 🟩 |
Whisper |
@@ -494,11 +512,7 @@ Verified for single instance mode
-
-
- 🟩 signifies that the model can perform well and with good accuracy (<1% difference as compared with FP32).
- 🟨 signifies that the model can perform well while accuracy may not been in a perfect state (>1% difference as compared with FP32).
-
+
Verified for distributed inference mode via DeepSpeed
@@ -547,6 +561,18 @@ Verified for distributed inference mode via DeepSpeed🟩 |
+
+ LLAMA |
+ meta-llama/Llama-3.2-3B-Instruct |
+ 🟩 |
+ 🟩 |
+
+
+ LLAMA |
+ meta-llama/Llama-3.2-11B-Vision-Instruct |
+ 🟩 |
+ 🟩 |
+
GPT-J |
EleutherAI/gpt-j-6b |
@@ -556,13 +582,13 @@ Verified for distributed inference mode via DeepSpeedGPT-NEOX |
EleutherAI/gpt-neox-20b |
- 🟨 |
+ 🟩 |
🟩 |
DOLLY |
databricks/dolly-v2-12b |
- 🟨 |
+ 🟩 |
🟩 |
@@ -580,7 +606,7 @@ Verified for distributed inference mode via DeepSpeedOPT |
facebook/opt-30b |
- 🟨 |
+ 🟩 |
🟩 |
@@ -592,7 +618,7 @@ Verified for distributed inference mode via DeepSpeedBloom |
bigscience/bloom-1b7 |
- 🟨 |
+ 🟩 |
🟩 |
@@ -616,7 +642,7 @@ Verified for distributed inference mode via DeepSpeedBaichuan |
baichuan-inc/Baichuan-13B-Chat |
- 🟨 |
+ 🟩 |
🟩 |
@@ -710,11 +736,7 @@ Verified for distributed inference mode via DeepSpeed
-
-
- 🟩 signifies that the model can perform well and with good accuracy (<1% difference as compared with FP32).
- 🟨 signifies that the model can perform well while accuracy may not been in a perfect state (>1% difference as compared with FP32).
-
Note: The above verified models (including other models in the same model family, like “codellama/CodeLlama-7b-hf” from LLAMA family) are well supported with all optimizations like indirect access KV cache, fused ROPE, and customized linear kernels. We are working in progress to better support the models in the tables with various data types. In addition, more models will be optimized in the future.
+Note: The above verified models (including other models in the same model family, like “codellama/CodeLlama-7b-hf” from LLAMA family) are well supported with all optimizations like indirect access KV cache, fused ROPE, and customized linear kernels. We are working in progress to better support the models in the tables with various data types. In addition, more models will be optimized in the future.
Please check LLM best known practice for instructions to install/setup environment and example scripts.
@@ -853,7 +875,7 @@ Distributed Inference using a
theme
provided by Read the Docs.
-
+
© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD),
http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/llm/llm_optimize.html b/cpu/2.5.0+cpu/tutorials/llm/llm_optimize.html
index 67a840168..3f7e17c55 100644
--- a/cpu/2.5.0+cpu/tutorials/llm/llm_optimize.html
+++ b/cpu/2.5.0+cpu/tutorials/llm/llm_optimize.html
@@ -271,7 +271,7 @@ Distributed Inference with DeepSpeed using a
theme
provided by Read the Docs.
-
+
© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD),
http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/performance.html b/cpu/2.5.0+cpu/tutorials/performance.html
index 3035ceb5a..0ed45be76 100644
--- a/cpu/2.5.0+cpu/tutorials/performance.html
+++ b/cpu/2.5.0+cpu/tutorials/performance.html
@@ -1038,7 +1038,7 @@ Hardware Configuration using a
theme
provided by Read the Docs.
-
+
© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD),
http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/performance_tuning/launch_script.html b/cpu/2.5.0+cpu/tutorials/performance_tuning/launch_script.html
index 60c029aa6..c4f061b05 100644
--- a/cpu/2.5.0+cpu/tutorials/performance_tuning/launch_script.html
+++ b/cpu/2.5.0+cpu/tutorials/performance_tuning/launch_script.html
@@ -835,7 +835,7 @@ GNU OpenMP Library using a
theme
provided by Read the Docs.
-
+
© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD),
http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/performance_tuning/torchserve.html b/cpu/2.5.0+cpu/tutorials/performance_tuning/torchserve.html
index 3bd16b73c..d21bff958 100644
--- a/cpu/2.5.0+cpu/tutorials/performance_tuning/torchserve.html
+++ b/cpu/2.5.0+cpu/tutorials/performance_tuning/torchserve.html
@@ -462,7 +462,7 @@ Performance Boost with Intel® Extension for PyTorch* and LauncherSphinx using a
theme
provided by Read the Docs.
-
+
© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD),
http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/performance_tuning/tuning_guide.html b/cpu/2.5.0+cpu/tutorials/performance_tuning/tuning_guide.html
index 1c2ef6a06..cac1039fc 100644
--- a/cpu/2.5.0+cpu/tutorials/performance_tuning/tuning_guide.html
+++ b/cpu/2.5.0+cpu/tutorials/performance_tuning/tuning_guide.html
@@ -366,7 +366,7 @@ OneDNN primitive cache using a
theme
provided by Read the Docs.
-
+
© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD),
http://opensource.org/licenses/0BSD.
diff --git a/cpu/2.5.0+cpu/tutorials/releases.html b/cpu/2.5.0+cpu/tutorials/releases.html
index 16fe282ca..8759d1240 100644
--- a/cpu/2.5.0+cpu/tutorials/releases.html
+++ b/cpu/2.5.0+cpu/tutorials/releases.html
@@ -58,101 +58,105 @@
Large Language Models (LLM)
Performance
Releases