diff --git a/QEfficient/cloud/compile.py b/QEfficient/cloud/compile.py index 4450aad19..f7c5e8a9f 100644 --- a/QEfficient/cloud/compile.py +++ b/QEfficient/cloud/compile.py @@ -82,6 +82,20 @@ action="store_true", help="If passed, this option allows MXINT8 compression of MDP IO traffic", ) + parser.add_argument( + "--enable_qnn", + "--enable-qnn", + action="store_true", + default=False, + help="Enables QNN. Optionally, a configuration file can be provided with [--enable_qnn CONFIG_FILE].\ + If not provided, the default configuration will be used.\ + Sample Config: QEfficient/cloud/compile/qnn_config.json", + ) + parser.add_argument( + "qnn_config", + nargs="?", + type=str, + ) # FIXME(ochougul): Allow extra compilation arguments args = parser.parse_args() QEfficient.compile(**vars(args)) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 870005c91..0ba0961e3 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -36,6 +36,8 @@ def main( cache_dir: Optional[str] = None, hf_token: Optional[str] = None, allow_mxint8_mdp_io: bool = False, + enable_qnn: Optional[bool] = False, + qnn_config: Optional[str] = None, ) -> None: """ 1. Check if compiled qpc for given config already exists, if it does jump to execute, else @@ -62,6 +64,8 @@ def main( :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.`` :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` :allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.`` + :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.`` + :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` .. code-block:: bash @@ -76,7 +80,17 @@ def main( ) qpc_dir_path = get_qpc_dir_path( - model_name, num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group, full_batch_size + model_name, + num_cores, + mos, + batch_size, + prompt_len, + ctx_len, + mxfp6, + mxint8, + device_group, + full_batch_size, + enable_qnn=enable_qnn, ) # Handle qpc generation @@ -107,6 +121,8 @@ def main( device_group=device_group, full_batch_size=full_batch_size, allow_mxint8_mdp_io=allow_mxint8_mdp_io, + enable_qnn=enable_qnn, + qnn_config=qnn_config, ) ######### @@ -206,6 +222,20 @@ def main( action="store_true", help="If passed, this option allows MXINT8 compression of MDP IO traffic", ) + parser.add_argument( + "--enable_qnn", + "--enable-qnn", + action="store_true", + default=False, + help="Enables QNN. 
Optionally, a configuration file can be provided with [--enable_qnn CONFIG_FILE].\ + If not provided, the default configuration will be used.\ + Sample Config: QEfficient/cloud/compile/qnn_config.json", + ) + parser.add_argument( + "qnn_config", + nargs="?", + type=str, + ) args = parser.parse_args() if args.verbose: diff --git a/QEfficient/compile/compile_helper.py b/QEfficient/compile/compile_helper.py index f98cf1b58..ba7c90a97 100644 --- a/QEfficient/compile/compile_helper.py +++ b/QEfficient/compile/compile_helper.py @@ -12,6 +12,7 @@ import warnings from typing import List, Optional, Tuple +from QEfficient.compile.qnn_compiler import compile as qnn_compile from QEfficient.utils.logging_utils import logger @@ -133,6 +134,8 @@ def compile( custom_io_file_path: Optional[str] = None, full_batch_size: Optional[int] = None, allow_mxint8_mdp_io: Optional[bool] = False, + enable_qnn: Optional[bool] = False, + qnn_config: Optional[str] = None, **kwargs, ) -> str: """ @@ -157,6 +160,8 @@ def compile( :mxint8 (bool): Compress Present/Past KV to ``MXINT8`` using ``CustomIO`` config. ``Defaults to False.`` :custom_io_file_path (str): Path to ``customIO`` file (formatted as a string). ``Defaults to None.`` :allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic ``Defaults to False.`` + :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.`` + :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` Returns: :str: Path to compiled ``qpc`` package. @@ -175,29 +180,47 @@ def compile( full_batch_size=full_batch_size, ) - # Select the customIO config based on the mx flag. - custom_io_file_name = "custom_io_int8.yaml" if mxint8 else "custom_io_fp16.yaml" - - if custom_io_file_path is None: - custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name) - - if not os.path.isfile(custom_io_file_path): - raise FileNotFoundError( - f"Custom IO file {custom_io_file_name} is not present at the expected path {custom_io_file_path}. Please pass the correct file path or rerun infer/export API" + if enable_qnn: + qpc_path = qnn_compile( + onnx_path=onnx_path, + qpc_path=qpc_path, + num_cores=num_cores, + batch_size=batch_size, + prompt_len=prompt_len, + ctx_len=ctx_len, + mxfp6=mxfp6, + mxint8=mxint8, + allow_mxint8_mdp_io=allow_mxint8_mdp_io, + aic_enable_depth_first=aic_enable_depth_first, + mos=mos, + device_group=device_group, + full_batch_size=full_batch_size, + qnn_config=qnn_config, ) - - _, qpc_path = compile_kv_model_on_cloud_ai_100( - onnx_path=onnx_path, - specializations_json=specialization_json_path, - num_cores=num_cores, - custom_io_path=custom_io_file_path, - base_path=qpc_path, - mxfp6=mxfp6, - aic_enable_depth_first=aic_enable_depth_first, - allow_mxint8_mdp_io=allow_mxint8_mdp_io, - mos=mos, - device_group=device_group, - ) - - logger.info(f"Compiled QPC files can be found here: {qpc_path}") + logger.info(f"QNN Compiled QPC files can be found here: {qpc_path}") + else: + # Select the customIO config based on the mx flag. + custom_io_file_name = "custom_io_int8.yaml" if mxint8 else "custom_io_fp16.yaml" + + if custom_io_file_path is None: + custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name) + + if not os.path.isfile(custom_io_file_path): + raise FileNotFoundError( + f"Custom IO file {custom_io_file_name} is not present at the expected path {custom_io_file_path}. 
Please pass the correct file path or rerun infer/export API" + ) + + _, qpc_path = compile_kv_model_on_cloud_ai_100( + onnx_path=onnx_path, + specializations_json=specialization_json_path, + num_cores=num_cores, + custom_io_path=custom_io_file_path, + base_path=qpc_path, + mxfp6=mxfp6, + aic_enable_depth_first=aic_enable_depth_first, + allow_mxint8_mdp_io=allow_mxint8_mdp_io, + mos=mos, + device_group=device_group, + ) + logger.info(f"Compiled QPC files can be found here: {qpc_path}") return qpc_path diff --git a/QEfficient/compile/qnn_compiler.py b/QEfficient/compile/qnn_compiler.py new file mode 100644 index 000000000..307deca19 --- /dev/null +++ b/QEfficient/compile/qnn_compiler.py @@ -0,0 +1,395 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import os +import shutil +from typing import List, Optional + +from QEfficient.utils._utils import create_json, execute_command, load_json +from QEfficient.utils.constants import QnnConstants +from QEfficient.utils.generate_qnn_network_specialization_config import fetch_nodes_info +from QEfficient.utils.logging_utils import logger + + +class QNN: + """ + The QNN class is designed for providing QNN compilation support for exported ONNX models. + This class enables use of QNN (Qualcomm Neural Network) sdk for compiling and running ml models on target device. + + """ + + def __init__( + self, + onnx_path: str, + qpc_path: str, + num_cores: int, + custom_io_path: str, + device_group: Optional[List[int]] = None, + compiler_enable_depth_first: bool = False, + compiler_max_out_channel_split: int = -1, + batch_size: int = 1, + prompt_len: int = 32, + ctx_len: int = 128, + compiler_mxfp6_matmul_weights: bool = True, + qnn_target: str = QnnConstants.TARGET, + qnn_config_path: Optional[str] = None, + **kwargs, + ) -> None: + self.onnx_path = onnx_path + self.qpc_path = qpc_path + self.num_cores = num_cores + self.device_group = device_group + self.compiler_enable_depth_first = compiler_enable_depth_first + self.compiler_max_out_channel_split = compiler_max_out_channel_split + self.batch_size = batch_size + self.prompt_len = prompt_len + self.ctx_len = ctx_len + self.compiler_mxfp6_matmul_weights = compiler_mxfp6_matmul_weights + self.qnn_config_path = qnn_config_path + self.custom_io_path = custom_io_path + self.dlc_model_path = os.path.join(qpc_path, f"{QnnConstants.MODEL_NAME}.dlc") + self.qnn_target = qnn_target + self.qnn_sdk_path = os.getenv(QnnConstants.QNN_SDK_PATH_ENV_VAR_NAME) + if not self.qnn_sdk_path: + raise EnvironmentError( + f"QNN_SDK_PATH {self.qnn_sdk_path} is not set. Please set {QnnConstants.QNN_SDK_PATH_ENV_VAR_NAME}" + ) + + # Handle additional keyword arguments + for key, value in kwargs.items(): + setattr(self, key, value) + + # Parse qnn_config file if present. + self.qnn_config = None + if self.qnn_config_path: + self.parse_qnn_config() + + def check_extension_arg(self, ext_arg_key, ext_arg_value, immutable_arg_list): + """ + Checks if the passed compile stage extension arguments are valid or not. + Raises an AttributeError if any immutable argument in present in the extension argument value. + + ``Mandatory`` Args: + :ext_arg_key (str): Extension argument key. 
+ :ext_arg_value (str): Extension argument value as present in the passed qnn_config.json + :immutable_arg_list (List): List containing parameters which can not be modified using qnn_config.json + + """ + + immutable_param = [param for param in immutable_arg_list if param in ext_arg_value] + if immutable_param: + raise AttributeError( + f"Immutable Parameters {immutable_param} found in {ext_arg_key}. Please remove {immutable_param} from {ext_arg_key}" + ) + + def parse_qnn_config(self): + """ + Parsed qnn_config.json file passed by the user for QNN Configuration and stores the key, value pair in class object. + + """ + config_data = load_json(self.qnn_config_path) + + self.qnn_config = {} + # Copy key-value pairs to the class object + for key, value in config_data.items(): + if key == QnnConstants.CONVERTOR_ARGS_EXTENSION_STR: + self.check_extension_arg(key, value, QnnConstants.IMMUTABLE_CONVERTOR_ARGS) + if key == QnnConstants.CONTEXT_BIN_ARGS_EXTENSION_STR: + self.check_extension_arg(key, value, QnnConstants.IMMUTABLE_CONTEXT_BIN_GEN_ARGS) + self.qnn_config[key] = value + + def create_qnn_tensor_slicing_json(self) -> str: + """ + Creates tensor_slicing.json file if device_group contains more than 1 device. + + Returns: + :str: Path to tensor_slicing.json file. + """ + tensor_slicing = { + "connections": [{"devices": list(range(len(self.device_group))), "type": "p2p"}], + "partitions": [ + { + "name": "Partition0", + "devices": [{"deviceId": device} for device in range(len(self.device_group))], + } + ], + } + tensor_slicing_json_path = os.path.join(self.qpc_path, "tensor_slicing.json") + create_json(tensor_slicing_json_path, tensor_slicing) + return tensor_slicing_json_path + + def create_qnn_compile_backend_json(self) -> str: + """ + Creates qnn_compile_backend.json file containing qnn_compilation_backend parameters. + If qnn_config.json file is passed, default values will be over-written. + + Returns: + :str: Path to qnn_compile_backend.json file. 
+ """ + qnn_compile_backend = { + "compiler_compilation_target": QnnConstants.COMPILER_COMPILATION_TARGET, + "compiler_hardware_version": QnnConstants.COMPILER_HARDWARE_VERSION, + "compiler_convert_to_FP16": QnnConstants.COMPILER_CONVERT_TO_FP16, + "compiler_retained_state": QnnConstants.COMPILER_RETAINED_STATE, + "graph_names": QnnConstants.GRAPH_NAMES, + "compiler_enable_depth_first": self.compiler_enable_depth_first, + "compiler_mxfp6_matmul_weights": self.compiler_mxfp6_matmul_weights, + "compiler_num_of_cores": self.num_cores, + "compiler_do_DDR_to_multicast": QnnConstants.COMPILER_DO_DDR_TO_MULTICAST, + "compiler_perfWarnings": QnnConstants.COMPILER_PERF_WARNINGS, + "compiler_printDDRStats": QnnConstants.COMPILER_PRINT_DDR_STATS, + "compiler_printPerfMetrics": QnnConstants.COMPILER_PRINT_PERF_METRICS, + "compiler_stat_level": QnnConstants.COMPILER_STAT_LEVEL, + "compiler_stats_batch_size": QnnConstants.COMPILER_STATS_BATCH_SIZE, + "compiler_time_passes": QnnConstants.COMPILER_TIME_PASSES, + } + if self.compiler_max_out_channel_split > 0: + qnn_compile_backend["compiler_max_out_channel_split"] = str(self.compiler_max_out_channel_split) + + if self.device_group is not None and len(self.device_group) > 1: + qnn_compile_backend["compiler_mdp_load_partition_config"] = self.create_qnn_tensor_slicing_json() + + if self.qnn_config and QnnConstants.QNN_COMPILATION_BACKEND_STR in self.qnn_config: + for key, value in self.qnn_config[QnnConstants.QNN_COMPILATION_BACKEND_STR].items(): + qnn_compile_backend[key] = value + + qnn_compile_backend_json_path = os.path.join(self.qpc_path, "qnn_compile_backend.json") + create_json(qnn_compile_backend_json_path, qnn_compile_backend) + return qnn_compile_backend_json_path + + def create_qnn_compiler_config_json(self) -> str: + """ + Creates qnn_compiler_config.json file containing path to qnn_compile_backend.json file & shared_library_path. + Config file is passed to QNN context-binary-generator. + + Returns: + :str: Path to qnn_compiler_config.json file. + """ + qnn_compiler_config = { + "backend_extensions": { + "config_file_path": self.create_qnn_compile_backend_json(), + "shared_library_path": QnnConstants.QNN_CONTEXT_LIB_NET_RUN_EXTENSIONS.format( + self.qnn_sdk_path, self.qnn_target + ), + } + } + qnn_compiler_config_json_path = os.path.join(self.qpc_path, "qnn_compiler_config.json") + create_json(qnn_compiler_config_json_path, qnn_compiler_config) + return qnn_compiler_config_json_path + + def compile(self) -> str: + """ + Compiles the given ``ONNX`` model during object creation using QNN compiler and saves the compiled ``qpc`` package at ``qpc_path``. + - Creates convertor command and convert onnx model to model.dlc using qairt-convertor + - command line arguments and qnn_config.json (if provided) are used to create qnn_compiler_config.json for context-binary-generator + - model.dlc from convertor stage is passed into context-binary-generator command to create programqpc.bin. + + Returns: + :str: Path to compiled ``qpc`` package. + """ + if not ( + self.qnn_config + and (QnnConstants.SKIP_QNN_CONVERTOR_STEP_STR in self.qnn_config) + and self.qnn_config[QnnConstants.SKIP_QNN_CONVERTOR_STEP_STR] + ): + converter_cmd = self.converter() + execute_command("convertor", converter_cmd, self.qpc_path) + + if not os.path.isfile(self.dlc_model_path): + raise FileNotFoundError( + f"file {self.dlc_model_path} needs to exist in the qpc_path{self.qpc_path}. 
Please rerun infer/compile Api" + ) + + self.qnn_binary_dir = os.path.join(self.qpc_path, "qpcs") + if os.path.isdir(self.qnn_binary_dir): + shutil.rmtree(self.qnn_binary_dir) + os.makedirs(self.qnn_binary_dir) + + ctx_bin_cmd = self.generate_context_binary() + execute_command("context_binary", ctx_bin_cmd, self.qpc_path) + + print("\n===================== Compilation Done! =====================\n") + return self.qnn_binary_dir + + def converter(self) -> str: + """ + Creates QNN convertor command using provided options. + + IMMUTABLE parameters which can not be overridden by the user using qnn_config.json: + :input_network (str): Generated ``ONNX`` Model Path. + :output_path (str): Path to generated DLC file, which is provided qpc_path/model.dlc + :io_config (str): Path to custom_io_config.yaml file created using GenerateQNNnetworkSpecializationconfig.py + :float_bias_bitwidth (int): Bitwidth to use for float bias tensor + :float_bitwidth (int): Converts the graph to the specified float bitwidth, either 32 or 16(Default). + :keep_int64_inputs(flag): Passed by default. + + CONVERTOR_ARGS_EXTENSION passed in qnn_config.json is appended to the command created. + + Returns: + :str: QNN Convertor command. + """ + converter_tool = QnnConstants.QAIRT_CONVERTER.format(self.qnn_sdk_path, self.qnn_target) + + cmd = ( + f"{converter_tool} --input_network {self.onnx_path} " + f"--output_path {self.dlc_model_path} " + f"--io_config {self.custom_io_path} " + f"--float_bias_bitwidth {QnnConstants.FLOAT_BIAS_BITWIDTH} " + f"--float_bitwidth {QnnConstants.FLOAT_BITWIDTH} " + ) + # Add default arguments. + cmd += QnnConstants.CONVERTOR_DEFAULT_ARGS + + if self.qnn_config and QnnConstants.CONVERTOR_ARGS_EXTENSION_STR in self.qnn_config: + cmd += self.qnn_config[QnnConstants.CONVERTOR_ARGS_EXTENSION_STR] + + return cmd + + def generate_context_binary(self) -> str: + """ + Creates QNN context-binary-generator command using provided options. + + IMMUTABLE parameters which can not be modified by the user using qnn_config.json: + :binary_file (str): QNN Binary Graph name to be generated (qnngraph.serialized). + :backend_binary (str): Path to generated QPC binary file, which is provided qpc_path/qpcs/programqpc.bin + :output_dir (str): Path to store generated Binaries (qpc_path/qpcs/). + :model (str): Path to the file containing a QNN network. + :dlc_path (str): Path to DLC file generated by QNN-Convertor. + :config_file(str): Path to created qnn_compiler_config.json containing qnn_compile_backend.json & shared_library_path. + + Configurable parameters: + :log_level(str): ``Configurable`` Default(error). + + CONTEXT_BIN_ARGS_EXTENSION passed in qnn_config.json is appended to the command created. + + Returns: + :str: QNN Context Binary Generator command. 
+ """ + binary_gen_tool = QnnConstants.QNN_CONTEXT_BIN.format(self.qnn_sdk_path, self.qnn_target) + backend_lib = QnnConstants.QNN_CONTEXT_LIB_BACKEND.format(self.qnn_sdk_path, self.qnn_target) + model_lib = QnnConstants.QNN_CONTEXT_LIB_MODEL.format(self.qnn_sdk_path, self.qnn_target) + config_file_path = self.create_qnn_compiler_config_json() + + cmd = ( + f"{binary_gen_tool} --binary_file {QnnConstants.CONTEXT_BIN_NAME} " + f"--backend_binary {QnnConstants.CONTEXT_BIN_QPC_NAME} " + f"--output_dir {self.qnn_binary_dir} " + f"--backend {backend_lib} " + f"--model {model_lib} " + f"--dlc_path {self.dlc_model_path} " + f"--config_file {config_file_path} " + ) + + if self.qnn_config and QnnConstants.CONTEXT_BIN_ARGS_EXTENSION_STR in self.qnn_config: + if "--log_level " not in self.qnn_config[QnnConstants.CONTEXT_BIN_ARGS_EXTENSION_STR]: + cmd += f"--log_level {QnnConstants.LOG_LEVEL} " + cmd += self.qnn_config[QnnConstants.CONTEXT_BIN_ARGS_EXTENSION_STR] + else: + cmd += f"--log_level {QnnConstants.LOG_LEVEL} " + + return cmd + + def quantize(self): + raise NotImplementedError("QNN Quantization is not supported") + + def execute(self): + raise NotImplementedError("QNN Execution is not supported") + + def generate_profiling(self): + raise NotImplementedError("QNN profiling is not supported") + + +def compile( + onnx_path: str, + qpc_path: str, + num_cores: int, + device_group: Optional[List[int]] = None, + aic_enable_depth_first: bool = False, + mos: int = -1, + batch_size: int = 1, + prompt_len: int = 32, + ctx_len: int = 128, + mxfp6: bool = True, + mxint8: bool = False, + allow_mxint8_mdp_io: Optional[bool] = False, + full_batch_size=None, + qnn_config: Optional[str] = None, + **kwargs, +) -> str: + """ + Compiles the given ``ONNX`` model using QNN compiler and saves the compiled ``qpc`` package at ``qpc_path``. + Generates model.dlc during convertor stage, qnn_compile_backend.json for backend parameters of context-binary-generator. + Generates tensor-slicing configuration if multiple devices are passed in ``device_group``. + + ``Mandatory`` Args: + :onnx_path (str): Generated ``ONNX`` Model Path. + :qpc_path (str): Path for saving compiled qpc binaries. + :num_cores (int): Number of cores to compile the model on. + ``Optional`` Args: + :device_group (List[int]): Used for finding the number of devices to compile for. + :aic_enable_depth_first (bool): Enables ``DFS`` with default memory size. ``Defaults to False.`` + :mos (int): Effort level to reduce the on-chip memory. ``Defaults to -1.`` + :batch_size (int): Batch size to compile the model for. ``Defaults to 1.`` + :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Default to None`` + :prompt_len (int): Prompt length for the model to compile. ``Defaults to 32`` + :ctx_len (int): Maximum context length to compile the model. ``Defaults to 128`` + :mxfp6 (bool): Enable compilation for ``MXFP6`` precision. ``Defaults to True.`` + :allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic ``Defaults to False.`` + :mxint8 (bool): Compress Present/Past KV to ``MXINT8`` using ``CustomIO`` config. ``Defaults to False.`` + :qnn_config (str): Path to ``qnn_config.json`` file (formatted as a string). ``Defaults to None.`` + + Returns: + :str: Path to compiled ``qpc`` package. 
+ """ + + if kwargs: + logger.warning("Extra arguments to QNN compilation are not supported as of now!") + + raise NotImplementedError("Can't handle extra compilation args now!") + + if allow_mxint8_mdp_io: + logger.warning("QNN doesn't support allow_mxint8_mdp_io. Bypassing the value passed for allow_mxint8_mdp_io") + + if mxint8: + logger.warning("QNN doesn't support mxint8. Bypassing the value passed for mxint8") + + os.makedirs(qpc_path, exist_ok=True) + + # Created custom_io_config.yaml file for QNN-Convertor stage. + # TODO To make custom_io_config.yaml configurable as not all models need it. + custom_io_file_path = os.path.join(qpc_path, "custom_io_config.yaml") + fetch_nodes_info( + onnx_graph_path=onnx_path, + batch_size=batch_size, + sequence_length=prompt_len, + context_length=ctx_len, + file_path=custom_io_file_path, + full_batch_size=full_batch_size, + ) + + if not os.path.isfile(custom_io_file_path): + raise FileNotFoundError( + f"file {custom_io_file_path} needs to exist in the qpc_path for Compilation. Please rerun infer/compile Api" + ) + + qnn_obj = QNN( + onnx_path=onnx_path, + qpc_path=qpc_path, + num_cores=num_cores, + device_group=device_group, + qnn_config_path=qnn_config, + custom_io_path=custom_io_file_path, + compiler_enable_depth_first=aic_enable_depth_first, + compiler_max_out_channel_split=mos, + batch_size=batch_size, + prompt_len=prompt_len, + ctx_len=ctx_len, + compiler_mxfp6_matmul_weights=mxfp6, + ) + + compiled_binary_path = qnn_obj.compile() + return compiled_binary_path diff --git a/QEfficient/compile/qnn_config.json b/QEfficient/compile/qnn_config.json new file mode 100644 index 000000000..18f12dd9a --- /dev/null +++ b/QEfficient/compile/qnn_config.json @@ -0,0 +1,11 @@ +{ + "convertor_args_extension": "", + "context_binary_generator_args_extension": "--log_level debug", + "qnn_compilation_backend": + { + "compiler_printDDRStats": false, + "compiler_printPerfMetrics": false, + "compiler_stat_level": 10 + }, + "SKIP_QNN_CONVERTOR_STEP": false +} \ No newline at end of file diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 29384d008..2729267d6 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -5,8 +5,10 @@ # # ----------------------------------------------------------------------------- +import json import os -from typing import List, Optional, Tuple, Union +import subprocess +from typing import Any, Dict, List, Optional, Tuple, Union import requests from huggingface_hub import login, snapshot_download @@ -196,10 +198,13 @@ def get_qpc_dir_path( device_group, full_batch_size, num_speculative_tokens: Optional[int] = None, + enable_qnn: Optional[bool] = False, ): # Create a unique directory name for the QPC model based on all parameters qpc_base_dir_name = ( - f"qpc_{num_cores}cores_{batch_size}bs_{prompt_len}pl_{ctx_len}cl_{mos}mos" + "qpc" + + f"{'_qnn_' if enable_qnn else '_'}" + + f"{num_cores}cores_{batch_size}bs_{prompt_len}pl_{ctx_len}cl_{mos}mos" + f"{f'_{full_batch_size}fbs_' if full_batch_size is not None else '_'}" + f"{f'_{num_speculative_tokens}nst_' if num_speculative_tokens is not None else ''}" + f"{len(device_group) if device_group is not None else 1}" @@ -317,3 +322,75 @@ def get_num_layers_from_config(config): raise ValueError("Invalid model configuration: n_layer/n_layers or num_hidden_layers not found.") return n_layer + + +def execute_command(process: str, command: str, output_file_path: Optional[str] = None): + """ + Executes the give command using subprocess. 
+ + ``Mandatory`` Args: + :process (str): Process name for which command is executed. + :command (str): Command to be executed on shell. + ``Optional`` Args: + :output_file_path (str): If provided stdout & stderr for the executed command will be dumped to a file. ``Defaults to None.`` + + """ + print(f"Running {process} command : \n {command}") + try: + result = subprocess.run(command, capture_output=True, text=True, shell=True) + except Exception as e: + print("Execution failed: %s", e) + + if result.returncode != 0: + raise RuntimeError(f"{process} failed Failed!!\n\nSTDOUT\n{result.stdout}\n\nSTDERR\n{result.stderr}") + else: + if output_file_path: + stdout_path = os.path.join(output_file_path, f"{process}_stdout.txt") + stderr_path = os.path.join(output_file_path, f"{process}_stderr.txt") + # Write the output to a file + try: + with open(stdout_path, "w") as file: + file.write(result.stdout) + except Exception as e: + print(f"Failed to create {stdout_path}: {e}") + try: + with open(stderr_path, "w") as file: + file.write(result.stderr) + except Exception as e: + print(f"Failed to create {stderr_path}: {e}") + + +def load_json(file_path: str) -> Dict[Any, Any]: + """ + Opens the given JSON file, load and return the JSON object. + + ``Mandatory`` Args: + :file_path (str): JSON File to be opened. + + Return: + JSON Object from the given file. + + """ + try: + # Load the JSON config file + with open(file_path, "r") as file: + config_data = json.load(file) + except Exception as e: + raise ValueError(f"Failed to load json object from {file_path}: {e}") + return config_data + + +def create_json(file_path: str, json_data: object): + """ + Creates a JSON file with provided JSON data. + + ``Mandatory`` Args: + :file_path (str): JSON File to be created. + :json_data (object): JSON Data Object to be populated inside the created file. + + """ + try: + with open(file_path, "w") as file: + json.dump(json_data, file, indent=4) + except Exception as e: + print(f"Failed to create JSON File {file_path}: {e}") diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 4a3ba3ff3..bfbac905f 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -6,6 +6,7 @@ # ----------------------------------------------------------------------------- import os +from dataclasses import dataclass UTILS_DIR = os.path.dirname(os.path.abspath(__file__)) QEFF_DIR = os.path.dirname(UTILS_DIR) @@ -62,3 +63,75 @@ class Constants: MAX_QPC_LIMIT = 30 MAX_RETRIES = 5 # This constant will be used set the maximum number of retry attempts for downloading a model using huggingface_hub snapshot_download NUM_SPECULATIVE_TOKENS = 2 + + +@dataclass +class QnnConstants: + # QNN PATH to be read from environment variable. 
+ QNN_SDK_PATH_ENV_VAR_NAME = "QNN_SDK_ROOT" + + # QNN Compilation tools + QAIRT_CONVERTER = "{}/bin/{}/qairt-converter" + QNN_CONTEXT_BIN = "{}/bin/{}/qnn-context-binary-generator" + + # QNN Libraries required for compilation + QNN_CONTEXT_LIB_BACKEND = "{}/lib/{}/libQnnAicCC.so" + QNN_CONTEXT_LIB_MODEL = "{}/lib/{}/libQnnModelDlc.so" + QNN_CONTEXT_LIB_NET_RUN_EXTENSIONS = "{}/lib/{}/libQnnAicNetRunExtensions.so" + + # QNN Compilation target names + MODEL_NAME = "model" + CONTEXT_BIN_NAME = "qnngraph.serialized" + CONTEXT_BIN_QPC_NAME = "programqpc.bin" + + # TARGET System Architecture + TARGET = "x86_64-linux-clang" # TODO add support in infer to be override + + # Convertor Arguments + FLOAT_BITWIDTH = 16 + FLOAT_BIAS_BITWIDTH = 32 + CONVERTOR_DEFAULT_ARGS = "--keep_int64_inputs --onnx_no_simplification " + + # Context-Binary-Generator Arguments + LOG_LEVEL = "error" + + # qnn_compilation_backend default Arguments + COMPILER_COMPILATION_TARGET = "hardware" + COMPILER_CONVERT_TO_FP16 = True + COMPILER_DO_DDR_TO_MULTICAST = True + COMPILER_HARDWARE_VERSION = "2.0" + COMPILER_PERF_WARNINGS = False + COMPILER_PRINT_DDR_STATS = False + COMPILER_PRINT_PERF_METRICS = False + COMPILER_RETAINED_STATE = True + COMPILER_STAT_LEVEL = 10 + COMPILER_STATS_BATCH_SIZE = 1 + COMPILER_TIME_PASSES = False + GRAPH_NAMES = [f"{MODEL_NAME}_configuration_1", f"{MODEL_NAME}_configuration_2"] + + # qnn_config JSON file supported Keys + CONVERTOR_ARGS_EXTENSION_STR = "convertor_args_extension" + CONTEXT_BIN_ARGS_EXTENSION_STR = "context_binary_generator_args_extension" + QNN_COMPILATION_BACKEND_STR = "qnn_compilation_backend" + SKIP_QNN_CONVERTOR_STEP_STR = "SKIP_QNN_CONVERTOR_STEP" + + IMMUTABLE_CONVERTOR_ARGS = [ + "--input_network ", + "--output_path ", + "--io_config ", + "--float_bias_bitwidth ", + "--float_bitwidth ", + "--keep_int64_inputs", + "--onnx_no_simplification", + "--onnx_defer_loading", + ] + + IMMUTABLE_CONTEXT_BIN_GEN_ARGS = [ + "--binary_file ", + "--backend_binary ", + "--output_dir ", + "--backend ", + "--model ", + "--dlc_path ", + "--config_file ", + ] diff --git a/QEfficient/utils/generate_qnn_network_specialization_config.py b/QEfficient/utils/generate_qnn_network_specialization_config.py new file mode 100644 index 000000000..0e5e17c08 --- /dev/null +++ b/QEfficient/utils/generate_qnn_network_specialization_config.py @@ -0,0 +1,144 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +from typing import Optional + +import onnx +import yaml +from onnx import helper + +""" + The network specilization file is generated by loading the onnx graph and fecthing the graph inputs and outputs. 
+""" + + +def fetch_nodes_info( + onnx_graph_path: str, + batch_size: int, + sequence_length: int, + context_length: int, + file_path: str = "custom_io_config.yaml", + full_batch_size: Optional[int] = None, + decode_only: Optional[bool] = False, +) -> None: + # Load the ONNX model + onnx_model = onnx.load(onnx_graph_path) + + input_nodes = [] + input_nodes_info = [] + final_dict = {} + output_nodes = [] + output_nodes_info = [] + for node in onnx_model.graph.input: + input_nodes.append(node.name) + input_info = {} + input_info["DataType"] = str(helper.tensor_dtype_to_np_dtype(node.type.tensor_type.elem_type)) + if "past_key" in node.name or "past_value" in node.name: + input_info["DataType"] = "float16" + + if "batch_index" in node.name: + if full_batch_size: + input_info["Shape"] = f"(1, 1), ({full_batch_size}, 1)" + else: + input_info["Shape"] = "(1, 1)" + else: + shapes = [] + for input_shape in node.type.tensor_type.shape.dim: + if input_shape.HasField("dim_value"): + shape = input_shape.dim_value + elif input_shape.HasField("dim_param"): + shape = input_shape.dim_param + else: + shape = "shape_not_found" + shapes.append(shape) + + if ( + ("batch_size" in shapes or "full_batch_size" in shapes) + and ("ctx_len" in shapes or "max_context_len" in shapes) + and len(shapes) >= 3 + ): + shapeList = [] + for shape in shapes: + if isinstance(shape, str): + if "full_batch_size" in shape: + if full_batch_size: + shapeList.append(full_batch_size) + else: + print("ERROR: Full batch size is required to generate custom_io_config.yaml") + exit() + elif "batch_size" in shape: + shapeList.append(batch_size) + elif shape in ["ctx_len", "max_context_len"]: + shapeList.append(context_length) + else: + shapeList.append(shape) + shape = str(shapeList).replace("[", "(").replace("]", ")") + elif "batch_size" in shapes and ("seq_len" in shapes or "prompt_len" in shapes): + shape_1 = ( + str( + [ + batch_size if isinstance(shape, str) and "batch_size" in shape else sequence_length + for shape in shapes + ] + ) + .replace("[", "(") + .replace("]", ")") + ) + if full_batch_size: + shape_2 = ( + str( + [ + full_batch_size if isinstance(shape, str) and "batch_size" in shape else 1 + for shape in shapes + ] + ) + .replace("[", "(") + .replace("]", ")") + ) + else: + shape_2 = ( + str([batch_size if isinstance(shape, str) and "batch_size" in shape else 1 for shape in shapes]) + .replace("[", "(") + .replace("]", ")") + ) + shape = shape_2 if decode_only else shape_1 + "," + shape_2 + elif ("batch_size" in shapes or "full_batch_size" in shapes) and ( + "ctx_len" in shapes or "max_context_len" in shapes + ): + shape = ( + str( + [ + batch_size if isinstance(shape, str) and "batch_size" in shape else context_length + for shape in shapes + ] + ) + .replace("[", "(") + .replace("]", ")") + ) + input_info["Shape"] = shape + input_nodes_info.append({"Name": node.name, "Desired Model Parameters": input_info}) + + # Prepare output tensor configuration + for output in onnx_model.graph.output: + output_nodes.append(output.name) + output_info = {} + output_info["DataType"] = str(helper.tensor_dtype_to_np_dtype(output.type.tensor_type.elem_type)) + if "past_key" in output.name or "past_value" in output.name: + output_info["DataType"] = "float16" + elif "logits" in output.name: + output_info["DataType"] = "float32" + output_nodes_info.append({"Name": output.name, "Desired Model Parameters": output_info}) + + # Combine input and output configurations + final_dict = {"Input Tensor Configuration": input_nodes_info, "Output Tensor 
Configuration": output_nodes_info} + + # Save the configuration to a YAML file + try: + with open(file_path, "w") as yaml_file: + yaml.dump(final_dict, yaml_file, default_flow_style=False, sort_keys=False) + except Exception as e: + print(f"Failed to create YAML File for QNN Network Specialization Configuration{file_path}: {e}") diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md index ac3e9cae9..6b1173226 100644 --- a/docs/source/quick_start.md +++ b/docs/source/quick_start.md @@ -1,5 +1,5 @@ -QEfficient Library was designed with one goal: +QEfficient Library was designed with one goal: **To make onboarding of models inference straightforward for any Transformer architecture, while leveraging the complete power of Cloud AI platform** @@ -34,28 +34,38 @@ This is the single e2e CLI API, which takes `model_card` name as input along wit ```bash # Check out the options using the help python -m QEfficient.cloud.infer --help -python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first ``` If executing for batch size>1, You can pass input prompts in single string but separate with pipe (|) symbol". Example below ```bash -python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is|The flat earth +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is|The flat earth theory is the belief that|The sun rises from" --mxfp6 --mos 1 --aic_enable_depth_first ``` You can also pass path of txt file with input prompts when you want to run inference on lot of prompts, Example below, sample txt file(prompts.txt) is present in examples folder. ```bash -python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompts_txt_file_path examples/prompts.txt --mxfp6 --mos 1 --aic_enable_depth_first +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompts_txt_file_path examples/prompts.txt --mxfp6 --mos 1 --aic_enable_depth_first ``` +For QNN Compilation, export $QNN_SDK_ROOT=/path/to/qnn_sdk_folder & add --enable_qnn in the command and an optional config file if user wish to override the default parameters. +Without QNN Config +```bash +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first --enable_qnn +``` + +With QNN Config +```bash +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first --enable_qnn QEfficient/compile/qnn_config.json +```` ### QEfficient.cloud.execute You can first run `infer` API and then use `execute` to run the pre-compiled model on Cloud AI 100 cards. Once we have compiled the QPC, we can now use the precompiled QPC in execute API to run for different prompts. Make sure to pass same `--device_group` as used during infer. 
Refer [Execute API doc](execute_api) for more details. ```bash -python -m QEfficient.cloud.execute --model_name gpt2 --qpc_path qeff_models/gpt2/qpc_16cores_1BS_32PL_128CL_1devices_mxfp6/qpcs --prompt "Once upon a time in" --device_group [0] +python -m QEfficient.cloud.execute --model_name gpt2 --qpc_path qeff_models/gpt2/qpc_16cores_1BS_32PL_128CL_1devices_mxfp6/qpcs --prompt "Once upon a time in" --device_group [0] ``` ### QEfficient.cloud.finetune @@ -70,12 +80,17 @@ For more details on finetune, checkout the subsection. You can also enable MQ, just based on the number of devices. Based on the `--device-group` as input it will create TS config on the fly. If `--device-group [0,1]` it will create TS config for 2 devices and use it for compilation, if `--device-group [0]` then TS compilation is skipped and single soc execution is enabled. ```bash -python -m QEfficient.cloud.infer --model_name Salesforce/codegen-2B-mono --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device-group [0,1] --prompt "def fibonacci(n):" --mos 2 --aic_enable_depth_first -``` +python -m QEfficient.cloud.infer --model_name Salesforce/codegen-2B-mono --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device-group [0,1] --prompt "def fibonacci(n):" --mos 2 --aic_enable_depth_first +``` + +For QNN Compilation, export $QNN_SDK_ROOT=/path/to/qnn_sdk_folder & add --enable_qnn in the command and an optional config file if user wish to override the default parameters. +```bash +python -m QEfficient.cloud.infer --model_name Salesforce/codegen-2B-mono --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device-group [0,1] --prompt "def fibonacci(n):" --mos 2 --aic_enable_depth_first --enable_qnn QEfficient/compile/qnn_config.json +``` Above step will save the `qpc` files under `efficient-transformers/qeff_models/{model_card_name}`, you can use the execute API to run for different prompts. This will automatically pick the pre-compiled `qpc` files. ```bash -python -m QEfficient.cloud.execute --model_name Salesforce/codegen-2B-mono --qpc-path qeff_models/Salesforce/codegen-2B-mono/qpc_16cores_1BS_32PL_128CL_2devices_mxfp6/qpcs --prompt "def binary_search(array: np.array, k: int):" --device-group [0,1] +python -m QEfficient.cloud.execute --model_name Salesforce/codegen-2B-mono --qpc-path qeff_models/Salesforce/codegen-2B-mono/qpc_16cores_1BS_32PL_128CL_2devices_mxfp6/qpcs --prompt "def binary_search(array: np.array, k: int):" --device-group [0,1] ``` To disable MQ, just pass single soc like below, below step will compile the model again and reuse the `ONNX` file as only compilation argument are different from above commands. @@ -84,8 +99,13 @@ To disable MQ, just pass single soc like below, below step will compile the mode python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device-group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first ``` +For QNN Compilation, export $QNN_SDK_ROOT=/path/to/qnn_sdk_folder & add --enable_qnn in the command and an optional config file if user wish to override the default parameters. 
+```bash +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device-group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first --enable_qnn QEfficient/compile/qnn_config.json +``` + -### Continuous Batching +### Continuous Batching Users can compile a model utilizing the continuous batching feature by specifying full_batch_size in the infer and compiler APIs. If full_batch_size is not provided, the model will be compiled in the regular way. @@ -94,7 +114,14 @@ When enabling continuous batching, batch size should not be specified. Users can leverage multi-Qranium and other supported features along with continuous batching. ```bash -python -m QEfficient.cloud.infer --model_name TinyLlama/TinyLlama_v1.1 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is" --mxfp6 --mos 1 --aic_enable_depth_first --full_batch_size 4 +python -m QEfficient.cloud.infer --model_name TinyLlama/TinyLlama_v1.1 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is|The flat earth +theory is the belief that|The sun rises from" --mxfp6 --mos 1 --aic_enable_depth_first --full_batch_size 3 +``` + +For QNN Compilation, export $QNN_SDK_ROOT=/path/to/qnn_sdk_folder & add --enable_qnn in the command and an optional config file if user wish to override the default parameters. +```bash +python -m QEfficient.cloud.infer --model_name TinyLlama/TinyLlama_v1.1 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is|The flat earth +theory is the belief that|The sun rises from" --mxfp6 --mos 1 --aic_enable_depth_first --full_batch_size 3 --enable_qnn QEfficient/compile/qnn_config.json ``` ## Python API @@ -143,6 +170,8 @@ generated_qpc_path = qeff_model.compile( num_cores=14, mxfp6=True, device_group=[0], + enable_qnn=True # if QNN Compilation path {default = False} + qnn_config = qnn_config_file_path # if QNN compilation configuration is passed {default = None}. ) ``` @@ -159,7 +188,7 @@ qeff_model.generate(prompts=["My name is"]) End to End demo examples for various models are available in **notebooks** directory. Please check them out. ### Draft-Based Speculative Decoding -Draft-based speculative decoding is a technique where a small Draft Language Model (DLM) makes `num_speculative_tokens` autoregressive speculations ahead of the Target Language Model (TLM). The objective is to predict what the TLM would have predicted if it would have been used instead of the DLM. This approach is beneficial when the autoregressive decode phase of the TLM is memory bound and thus, we can leverage the extra computing resources of our hardware by batching the speculations of the DLM as an input to TLM to validate the speculations. +Draft-based speculative decoding is a technique where a small Draft Language Model (DLM) makes `num_speculative_tokens` autoregressive speculations ahead of the Target Language Model (TLM). The objective is to predict what the TLM would have predicted if it would have been used instead of the DLM. This approach is beneficial when the autoregressive decode phase of the TLM is memory bound and thus, we can leverage the extra computing resources of our hardware by batching the speculations of the DLM as an input to TLM to validate the speculations. 
To export and compile both DLM/TLM, add corresponding `is_tlm` and `num_speculative_tokens` for TLM and export DLM as you would any other QEfficient LLM model: @@ -173,4 +202,4 @@ tlm.compile(num_speculative_tokens=k) dlm.compile() ``` -The `is_tlm` flag is fed during the instantiation of the model because slight changes to the ONNX graph are required. Once complete, the user can specify `num_speculative_tokens` to define the actual number of speculations that the TLM will take as input during the decode phase. As for the DLM, no new changes are required at the ONNX or compile level. \ No newline at end of file +The `is_tlm` flag is fed during the instantiation of the model because slight changes to the ONNX graph are required. Once complete, the user can specify `num_speculative_tokens` to define the actual number of speculations that the TLM will take as input during the decode phase. As for the DLM, no new changes are required at the ONNX or compile level. \ No newline at end of file diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index f1d37fe86..f05540c8a 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -74,14 +74,32 @@ pipeline { mkdir -p $PWD/cli && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/cli && - pytest tests -m cli --junitxml=tests/tests_log3.xml && - junitparser merge tests/tests_log1.xml tests/tests_log2.xml tests/tests_log3.xml tests/tests_log.xml && + pytest tests -m '(cli and not qnn)' --junitxml=tests/tests_log3.xml && deactivate" ''' } } } - } + stage('QNN CLI Tests') { + steps { + timeout(time: 30, unit: 'MINUTES') { + sh ''' + docker exec ${BUILD_TAG} bash -c " + source /qnn_sdk/bin/envsetup.sh && + source /qnn_sdk/bin/envcheck -c && + cd /efficient-transformers && + . preflight_qeff/bin/activate && + mkdir -p $PWD/Qnn_cli && + export TOKENIZERS_PARALLELISM=false && + export QEFF_HOME=$PWD/Qnn_cli && + pytest tests -m '(cli and qnn)' --junitxml=tests/tests_log4.xml && + junitparser merge tests/tests_log1.xml tests/tests_log2.xml tests/tests_log3.xml tests/tests_log4.xml tests/tests_log.xml && + deactivate" + ''' + } + } + } + } post { always { diff --git a/tests/cloud/conftest.py b/tests/cloud/conftest.py index d6b3702af..0810afb7d 100644 --- a/tests/cloud/conftest.py +++ b/tests/cloud/conftest.py @@ -47,6 +47,8 @@ def __init__( mxint8, full_batch_size, device_group, + enable_qnn, + qnn_config, ): """ Initialization set up @@ -66,6 +68,8 @@ def __init__( param: mxint8: bool param: full_batch_size: int param: device_group: List[int] + param: enable_qnn: bool + param: qnn_config: str """ self.model_name = model_name self.num_cores = num_cores @@ -84,13 +88,15 @@ def __init__( self.mxint8 = mxint8 self.full_batch_size = full_batch_size self.device_group = device_group + self.enable_qnn = enable_qnn + self.qnn_config = qnn_config def model_card_dir(self): return str(os.path.join(QEFF_MODELS_DIR, str(self.model_name))) def qpc_base_dir_path(self): base_dir_name = str( - f"qpc_{self.num_cores}cores_{self.batch_size}bs_{self.prompt_len}pl_{self.ctx_len}cl_{self.mos}mos" + f"qpc{'_qnn_' if self.enable_qnn else '_'}{self.num_cores}cores_{self.batch_size}bs_{self.prompt_len}pl_{self.ctx_len}cl_{self.mos}mos" + f"{f'_{self.full_batch_size}fbs_' if self.full_batch_size is not None else '_'}" + f"{len(self.device_group) if self.device_group is not None else 1}" + "devices" @@ -156,6 +162,8 @@ def setup( mxint8, full_batch_size, device_group, + enable_qnn, + qnn_config, ): """ It is a fixture or shared object of all testing script within or 
inner folder, @@ -180,6 +188,8 @@ def setup( bool(mxint8), full_batch_size, device_group, + enable_qnn, + qnn_config, ) yield model_setup @@ -217,6 +227,8 @@ def pytest_generate_tests(metafunc): metafunc.parametrize("mxint8", json_data["mxint8"], ids=lambda x: "mxint8=" + str(x)) metafunc.parametrize("full_batch_size", json_data["full_batch_size"], ids=lambda x: "full_batch_size=" + str(x)) metafunc.parametrize("device_group", json_data["device_group"], ids=lambda x: "device_group=" + str(x)) + metafunc.parametrize("enable_qnn", json_data["enable_qnn"], ids=lambda x: "enable_qnn=" + str(x)) + metafunc.parametrize("qnn_config", json_data["qnn_config"], ids=lambda x: "qnn_config=" + str(x)) def pytest_collection_modifyitems(config, items): @@ -270,6 +282,12 @@ def pytest_collection_modifyitems(config, items): if item.module.__name__ in ["test_export", "test_compile", "test_execute", "test_infer"]: if hasattr(item, "callspec"): params = item.callspec.params + if not params["enable_qnn"] and params["qnn_config"] is not None: + item.add_marker( + pytest.mark.skip(reason="Skipping because same as enable_qnn = false and qnn_config = None") + ) + if params["enable_qnn"]: + item.add_marker(pytest.mark.qnn) if item.module.__name__ in ["test_export", "test_compile", "test_execute"]: if hasattr(item, "callspec"): diff --git a/tests/cloud/high_level_testing.json b/tests/cloud/high_level_testing.json index 83fff5ac2..fb4d7c19f 100644 --- a/tests/cloud/high_level_testing.json +++ b/tests/cloud/high_level_testing.json @@ -14,5 +14,7 @@ "mxfp6" : [1], "mxint8" : [1], "device_group" : [null], - "full_batch_size" : [null,3] + "full_batch_size" : [null,3], + "enable_qnn" : [false, true], + "qnn_config" : [null, "QEfficient/compile/qnn_config.json"] } diff --git a/tests/cloud/test_compile.py b/tests/cloud/test_compile.py index 6d06a3c99..9bfe39647 100644 --- a/tests/cloud/test_compile.py +++ b/tests/cloud/test_compile.py @@ -42,6 +42,7 @@ def test_compile(setup, mocker): mxfp6=ms.mxfp6, mxint8=ms.mxint8, full_batch_size=ms.full_batch_size, + enable_qnn=ms.enable_qnn, ) assert os.path.isdir(ms.qpc_dir_path()) diff --git a/tests/cloud/test_infer.py b/tests/cloud/test_infer.py index 8cd61a050..e28c3a38a 100644 --- a/tests/cloud/test_infer.py +++ b/tests/cloud/test_infer.py @@ -52,6 +52,7 @@ def test_infer(setup, mocker): mxfp6=ms.mxfp6, mxint8=ms.mxint8, full_batch_size=ms.full_batch_size, + enable_qnn=ms.enable_qnn, ) # tokenizer check load_hf_tokenizer_spy.assert_called_once()
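
For reference, a minimal sketch of how the QNN path added in this diff could be exercised from the Python API (the SDK location, ONNX/QPC paths, and core counts below are placeholders for illustration, not values taken from this change):

```python
import os

import QEfficient

# The QNN flow reads the SDK location from the QNN_SDK_ROOT environment
# variable (QnnConstants.QNN_SDK_PATH_ENV_VAR_NAME) and raises an error
# early if it is unset.
os.environ["QNN_SDK_ROOT"] = "/opt/qnn_sdk"  # placeholder path

# Hypothetical ONNX/QPC locations for illustration only.
onnx_path = "qeff_models/gpt2/onnx/gpt2_kv.onnx"
qpc_path = "qeff_models/gpt2/qpc_qnn_16cores_1bs_32pl_128cl_-1mos_1devices_mxfp6"

# With enable_qnn=True, QEfficient.compile() dispatches to the new
# qnn_compiler.compile() path (qairt-converter followed by
# qnn-context-binary-generator) instead of the default flow; qnn_config is
# optional and, when provided, overrides the default converter/backend settings.
compiled_qpc = QEfficient.compile(
    onnx_path=onnx_path,
    qpc_path=qpc_path,
    num_cores=16,
    batch_size=1,
    prompt_len=32,
    ctx_len=128,
    mxfp6=True,
    aic_enable_depth_first=True,
    enable_qnn=True,
    qnn_config="QEfficient/compile/qnn_config.json",
)
print(f"QNN compiled QPC package: {compiled_qpc}")
```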