QNN Compilation Support (#171)

QNN Compilation Support in Command Line Interface commands:

1. Infer/Compile API changes to include `--enable_qnn [Optional QNN Config File]`.
2. Added the qnn_config.json file format.
3. Added generate_qnn_network_specialization_config.py to create the custom_io_config.yaml file for the QNN Compilation step.
4. Modified utils/constants.py to include the QnnConstants required to support QNN compilation.
5. Updated quick_start.md to include the QNN Compilation steps.
6. Added QNN Compilation utilities in _utils.py.
7. Added unit tests for the QNN Compilation path.

Signed-off-by: Shubham Agrawal <[email protected]>
quic · Dec 18, 2024 · dc2c509 · dc2c509
1 parent 1d7c624
commit dc2c509
Show file tree

Hide file tree

Showing 14 changed files with 880 additions and 44 deletions.
diff --git a/QEfficient/cloud/compile.py b/QEfficient/cloud/compile.py
@@ -82,6 +82,20 @@
         action="store_true",
         help="If passed, this option allows MXINT8 compression of MDP IO traffic",
     )
+    parser.add_argument(
+        "--enable_qnn",
+        "--enable-qnn",
+        action="store_true",
+        default=False,
+        help="Enables QNN. Optionally, a configuration file can be provided with [--enable_qnn CONFIG_FILE].\
+             If not provided, the default configuration will be used.\
+             Sample Config: QEfficient/cloud/compile/qnn_config.json",
+    )
+    parser.add_argument(
+        "qnn_config",
+        nargs="?",
+        type=str,
+    )
     # FIXME(ochougul): Allow extra compilation arguments
     args = parser.parse_args()
     QEfficient.compile(**vars(args))
diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
@@ -36,6 +36,8 @@ def main(
     cache_dir: Optional[str] = None,
     hf_token: Optional[str] = None,
     allow_mxint8_mdp_io: bool = False,
+    enable_qnn: Optional[bool] = False,
+    qnn_config: Optional[str] = None,
 ) -> None:
     """
     1. Check if compiled qpc for given config already exists, if it does jump to execute, else
@@ -62,6 +64,8 @@ def main(
         :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.``
         :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.``
         :allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
+        :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
+        :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
 
     .. code-block:: bash
 
@@ -76,7 +80,17 @@ def main(
     )
 
     qpc_dir_path = get_qpc_dir_path(
-        model_name, num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group, full_batch_size
+        model_name,
+        num_cores,
+        mos,
+        batch_size,
+        prompt_len,
+        ctx_len,
+        mxfp6,
+        mxint8,
+        device_group,
+        full_batch_size,
+        enable_qnn=enable_qnn,
     )
 
     # Handle qpc generation
@@ -107,6 +121,8 @@ def main(
             device_group=device_group,
             full_batch_size=full_batch_size,
             allow_mxint8_mdp_io=allow_mxint8_mdp_io,
+            enable_qnn=enable_qnn,
+            qnn_config=qnn_config,
         )
 
     #########
@@ -206,6 +222,20 @@ def main(
         action="store_true",
         help="If passed, this option allows MXINT8 compression of MDP IO traffic",
     )
+    parser.add_argument(
+        "--enable_qnn",
+        "--enable-qnn",
+        action="store_true",
+        default=False,
+        help="Enables QNN. Optionally, a configuration file can be provided with [--enable_qnn CONFIG_FILE].\
+             If not provided, the default configuration will be used.\
+             Sample Config: QEfficient/cloud/compile/qnn_config.json",
+    )
+    parser.add_argument(
+        "qnn_config",
+        nargs="?",
+        type=str,
+    )
 
     args = parser.parse_args()
     if args.verbose:

diff --git a/QEfficient/compile/compile_helper.py b/QEfficient/compile/compile_helper.py
@@ -12,6 +12,7 @@
 import warnings
 from typing import List, Optional, Tuple
 
+from QEfficient.compile.qnn_compiler import compile as qnn_compile
 from QEfficient.utils.logging_utils import logger
 
 
@@ -133,6 +134,8 @@ def compile(
     custom_io_file_path: Optional[str] = None,
     full_batch_size: Optional[int] = None,
     allow_mxint8_mdp_io: Optional[bool] = False,
+    enable_qnn: Optional[bool] = False,
+    qnn_config: Optional[str] = None,
     **kwargs,
 ) -> str:
     """
@@ -157,6 +160,8 @@ def compile(
         :mxint8 (bool): Compress Present/Past KV to ``MXINT8`` using ``CustomIO`` config. ``Defaults to False.``
         :custom_io_file_path (str): Path to ``customIO`` file (formatted as a string). ``Defaults to None.``
         :allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic ``Defaults to False.``
+        :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
+        :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
 
     Returns:
         :str: Path to compiled ``qpc`` package.
@@ -175,29 +180,47 @@ def compile(
         full_batch_size=full_batch_size,
     )
 
-    # Select the customIO config based on the mx flag.
-    custom_io_file_name = "custom_io_int8.yaml" if mxint8 else "custom_io_fp16.yaml"
-
-    if custom_io_file_path is None:
-        custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name)
-
-    if not os.path.isfile(custom_io_file_path):
-        raise FileNotFoundError(
-            f"Custom IO file {custom_io_file_name} is not present at the expected path {custom_io_file_path}. Please pass the correct file path or rerun infer/export API"
+    if enable_qnn:
+        qpc_path = qnn_compile(
+            onnx_path=onnx_path,
+            qpc_path=qpc_path,
+            num_cores=num_cores,
+            batch_size=batch_size,
+            prompt_len=prompt_len,
+            ctx_len=ctx_len,
+            mxfp6=mxfp6,
+            mxint8=mxint8,
+            allow_mxint8_mdp_io=allow_mxint8_mdp_io,
+            aic_enable_depth_first=aic_enable_depth_first,
+            mos=mos,
+            device_group=device_group,
+            full_batch_size=full_batch_size,
+            qnn_config=qnn_config,
         )
-
-    _, qpc_path = compile_kv_model_on_cloud_ai_100(
-        onnx_path=onnx_path,
-        specializations_json=specialization_json_path,
-        num_cores=num_cores,
-        custom_io_path=custom_io_file_path,
-        base_path=qpc_path,
-        mxfp6=mxfp6,
-        aic_enable_depth_first=aic_enable_depth_first,
-        allow_mxint8_mdp_io=allow_mxint8_mdp_io,
-        mos=mos,
-        device_group=device_group,
-    )
-
-    logger.info(f"Compiled QPC files can be found here: {qpc_path}")
+        logger.info(f"QNN Compiled QPC files can be found here: {qpc_path}")
+    else:
+        # Select the customIO config based on the mx flag.
+        custom_io_file_name = "custom_io_int8.yaml" if mxint8 else "custom_io_fp16.yaml"
+
+        if custom_io_file_path is None:
+            custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name)
+
+        if not os.path.isfile(custom_io_file_path):
+            raise FileNotFoundError(
+                f"Custom IO file {custom_io_file_name} is not present at the expected path {custom_io_file_path}. Please pass the correct file path or rerun infer/export API"
+            )
+
+        _, qpc_path = compile_kv_model_on_cloud_ai_100(
+            onnx_path=onnx_path,
+            specializations_json=specialization_json_path,
+            num_cores=num_cores,
+            custom_io_path=custom_io_file_path,
+            base_path=qpc_path,
+            mxfp6=mxfp6,
+            aic_enable_depth_first=aic_enable_depth_first,
+            allow_mxint8_mdp_io=allow_mxint8_mdp_io,
+            mos=mos,
+            device_group=device_group,
+        )
+        logger.info(f"Compiled QPC files can be found here: {qpc_path}")
     return qpc_path