diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index 82fc4221..2760cf52 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -21,8 +21,10 @@ from QEfficient.base.onnx_transforms import OnnxTransform from QEfficient.base.pytorch_transforms import PytorchTransform +from QEfficient.compile.qnn_compiler import compile as qnn_compile from QEfficient.generation.cloud_infer import QAICInferenceSession from QEfficient.utils import constants +from QEfficient.utils._utils import load_json from QEfficient.utils.cache import QEFF_HOME, to_hashable logger = logging.getLogger(__name__) @@ -319,3 +321,99 @@ def _compile( self.qpc_path = qpc_path return qpc_path + + def _qnn_compile( + self, + onnx_path: Optional[str] = None, + compile_dir: Optional[str] = None, + *, + specializations: Optional[List[Dict[str, int]]] = None, + prefill_seq_len: int = 32, + ctx_len: int = 128, + batch_size: int = 1, + full_batch_size: Optional[int] = None, + mdp_ts_num_devices: int = 1, + num_cores: int = 16, + mxfp6_matmul: bool = False, + mxint8_kv_cache: bool = False, + qnn_config: Optional[str] = None, + ) -> str: + """ + Interface for the QNN compiler + + Args: + :onnx_path (str): ONNX file to compile + :compile_dir (str): Directory path to compile the qpc. A suffix is added to the directory path to avoid reusing the same qpc for different parameters. + :specializations (list): List of specializations to compile for + :prefill_seq_len (int, optional): The length of the Prefill prompt should be less than ``prefill_seq_len``. ``Defaults to 32``. + :ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``. + :batch_size (int, optional): Batch size. ``Defaults to 1``. + :full_batch_size (int, optional): Continuous batching batch size. + :mdp_ts_num_devices (int): Number of devices to partition across when using Multi-Device Partitioning with tensor-slicing. + :num_cores (int): Number of cores used to compile the model. + :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``. + :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``. + :qnn_config (str): Path of QNN Config parameters file.
``Defaults to None.`` + """ + if onnx_path is None and self.onnx_path is None: + self.export() + + onnx_path = Path(onnx_path or self.onnx_path) + compile_dir = Path(compile_dir or onnx_path.parent) + qpc_path = compile_dir / "qpc" + if not onnx_path.is_file(): + raise FileNotFoundError(f"ONNX file not found at: {onnx_path}") + + compile_hash = hashlib.sha256(to_hashable("qnn")) + + if specializations is not None: + compile_hash.update(to_hashable(specializations)) + + if qnn_config is not None: + qnn_config_values = load_json(qnn_config) + compile_hash.update(to_hashable(qnn_config_values)) + + if mdp_ts_num_devices > 1: + compile_hash.update(to_hashable({"mdp_ts_num_devices": mdp_ts_num_devices})) + + compile_hash.update(to_hashable({"num_cores": num_cores})) + compile_hash.update(to_hashable({"mxfp6_matmul": mxfp6_matmul})) + compile_hash.update(to_hashable({"mxint8_kv_cache": mxint8_kv_cache})) + + # Check if already compiled + compile_hash = compile_hash.hexdigest()[:16] + qpc_path = qpc_path.with_name(qpc_path.name + "-" + compile_hash) + if qpc_path.is_dir(): + if (qpc_path / "programqpc.bin").is_file(): + self.qpc_path = qpc_path + return qpc_path + # Probably compilation failure last time, delete directory to start over + shutil.rmtree(qpc_path) + + # Write specializations.json file + if specializations is not None: + specializations_json = compile_dir / "specializations.json" + with open(specializations_json, "w") as fp: + json.dump( + {"specializations": [{k: str(v) for k, v in spec.items()} for spec in specializations]}, + fp, + indent=4, + ) + + qnn_compile( + onnx_path=onnx_path, + qpc_base_path=compile_dir, + num_cores=num_cores, + device_group=list(range(mdp_ts_num_devices)), + batch_size=batch_size, + prompt_len=prefill_seq_len, + ctx_len=ctx_len, + mxfp6=mxfp6_matmul, + mxint8=mxint8_kv_cache, + full_batch_size=full_batch_size, + qnn_config=qnn_config, + qnn_binary_dir=qpc_path, + ) + + self.qpc_path = qpc_path + return qpc_path diff --git a/QEfficient/compile/compile_helper.py b/QEfficient/compile/compile_helper.py index ba7c90a9..ae86b493 100644 --- a/QEfficient/compile/compile_helper.py +++ b/QEfficient/compile/compile_helper.py @@ -183,7 +183,7 @@ def compile( if enable_qnn: qpc_path = qnn_compile( onnx_path=onnx_path, - qpc_path=qpc_path, + qpc_base_path=qpc_path, num_cores=num_cores, batch_size=batch_size, prompt_len=prompt_len, diff --git a/QEfficient/compile/qnn_compiler.py b/QEfficient/compile/qnn_compiler.py index 307deca1..ad5da976 100644 --- a/QEfficient/compile/qnn_compiler.py +++ b/QEfficient/compile/qnn_compiler.py @@ -25,7 +25,7 @@ class QNN: def __init__( self, onnx_path: str, - qpc_path: str, + qpc_base_path: str, num_cores: int, custom_io_path: str, device_group: Optional[List[int]] = None, @@ -37,10 +37,11 @@ def __init__( compiler_mxfp6_matmul_weights: bool = True, qnn_target: str = QnnConstants.TARGET, qnn_config_path: Optional[str] = None, + qnn_binary_dir: Optional[str] = None, **kwargs, ) -> None: self.onnx_path = onnx_path - self.qpc_path = qpc_path + self.qpc_base_path = qpc_base_path self.num_cores = num_cores self.device_group = device_group self.compiler_enable_depth_first = compiler_enable_depth_first @@ -50,8 +51,9 @@ def __init__( self.ctx_len = ctx_len self.compiler_mxfp6_matmul_weights = compiler_mxfp6_matmul_weights self.qnn_config_path = qnn_config_path + self.qnn_binary_dir = qnn_binary_dir self.custom_io_path = custom_io_path - self.dlc_model_path = os.path.join(qpc_path, f"{QnnConstants.MODEL_NAME}.dlc") + self.dlc_model_path = 
os.path.join(qpc_base_path, f"{QnnConstants.MODEL_NAME}.dlc") self.qnn_target = qnn_target self.qnn_sdk_path = os.getenv(QnnConstants.QNN_SDK_PATH_ENV_VAR_NAME) if not self.qnn_sdk_path: @@ -118,7 +120,7 @@ def create_qnn_tensor_slicing_json(self) -> str: } ], } - tensor_slicing_json_path = os.path.join(self.qpc_path, "tensor_slicing.json") + tensor_slicing_json_path = os.path.join(self.qpc_base_path, "tensor_slicing.json") create_json(tensor_slicing_json_path, tensor_slicing) return tensor_slicing_json_path @@ -157,7 +159,7 @@ def create_qnn_compile_backend_json(self) -> str: for key, value in self.qnn_config[QnnConstants.QNN_COMPILATION_BACKEND_STR].items(): qnn_compile_backend[key] = value - qnn_compile_backend_json_path = os.path.join(self.qpc_path, "qnn_compile_backend.json") + qnn_compile_backend_json_path = os.path.join(self.qpc_base_path, "qnn_compile_backend.json") create_json(qnn_compile_backend_json_path, qnn_compile_backend) return qnn_compile_backend_json_path @@ -177,13 +179,13 @@ def create_qnn_compiler_config_json(self) -> str: ), } } - qnn_compiler_config_json_path = os.path.join(self.qpc_path, "qnn_compiler_config.json") + qnn_compiler_config_json_path = os.path.join(self.qpc_base_path, "qnn_compiler_config.json") create_json(qnn_compiler_config_json_path, qnn_compiler_config) return qnn_compiler_config_json_path def compile(self) -> str: """ - Compiles the given ``ONNX`` model during object creation using QNN compiler and saves the compiled ``qpc`` package at ``qpc_path``. + Compiles the given ``ONNX`` model during object creation using QNN compiler and saves the compiled ``qpc`` package at ``qnn_binary_dir``. - Creates convertor command and convert onnx model to model.dlc using qairt-convertor - command line arguments and qnn_config.json (if provided) are used to create qnn_compiler_config.json for context-binary-generator - model.dlc from convertor stage is passed into context-binary-generator command to create programqpc.bin. @@ -197,20 +199,21 @@ def compile(self) -> str: and self.qnn_config[QnnConstants.SKIP_QNN_CONVERTOR_STEP_STR] ): converter_cmd = self.converter() - execute_command("convertor", converter_cmd, self.qpc_path) + execute_command("convertor", converter_cmd, self.qpc_base_path) if not os.path.isfile(self.dlc_model_path): raise FileNotFoundError( - f"file {self.dlc_model_path} needs to exist in the qpc_path{self.qpc_path}. Please rerun infer/compile Api" + f"file {self.dlc_model_path} needs to exist in the qpc_base_path{self.qpc_base_path}. Please rerun infer/compile Api" ) - self.qnn_binary_dir = os.path.join(self.qpc_path, "qpcs") + if self.qnn_binary_dir is None: + self.qnn_binary_dir = os.path.join(self.qpc_base_path, "qpcs") if os.path.isdir(self.qnn_binary_dir): shutil.rmtree(self.qnn_binary_dir) os.makedirs(self.qnn_binary_dir) ctx_bin_cmd = self.generate_context_binary() - execute_command("context_binary", ctx_bin_cmd, self.qpc_path) + execute_command("context_binary", ctx_bin_cmd, self.qpc_base_path) print("\n===================== Compilation Done! =====================\n") return self.qnn_binary_dir @@ -221,7 +224,7 @@ def converter(self) -> str: IMMUTABLE parameters which can not be overridden by the user using qnn_config.json: :input_network (str): Generated ``ONNX`` Model Path. 
- :output_path (str): Path to generated DLC file, which is provided qpc_path/model.dlc + :output_path (str): Path to generated DLC file, which is provided qpc_base_path/model.dlc :io_config (str): Path to custom_io_config.yaml file created using GenerateQNNnetworkSpecializationconfig.py :float_bias_bitwidth (int): Bitwidth to use for float bias tensor :float_bitwidth (int): Converts the graph to the specified float bitwidth, either 32 or 16(Default). @@ -255,8 +258,8 @@ def generate_context_binary(self) -> str: IMMUTABLE parameters which can not be modified by the user using qnn_config.json: :binary_file (str): QNN Binary Graph name to be generated (qnngraph.serialized). - :backend_binary (str): Path to generated QPC binary file, which is provided qpc_path/qpcs/programqpc.bin - :output_dir (str): Path to store generated Binaries (qpc_path/qpcs/). + :backend_binary (str): Generated QPC binary file name, which is provided programqpc.bin + :output_dir (str): Path to store generated Binaries (qnn_binary_dir). :model (str): Path to the file containing a QNN network. :dlc_path (str): Path to DLC file generated by QNN-Convertor. :config_file(str): Path to created qnn_compiler_config.json containing qnn_compile_backend.json & shared_library_path. @@ -305,7 +308,7 @@ def generate_profiling(self): def compile( onnx_path: str, - qpc_path: str, + qpc_base_path: str, num_cores: int, device_group: Optional[List[int]] = None, aic_enable_depth_first: bool = False, @@ -318,16 +321,17 @@ def compile( allow_mxint8_mdp_io: Optional[bool] = False, full_batch_size=None, qnn_config: Optional[str] = None, + qnn_binary_dir: Optional[str] = None, **kwargs, ) -> str: """ - Compiles the given ``ONNX`` model using QNN compiler and saves the compiled ``qpc`` package at ``qpc_path``. + Compiles the given ``ONNX`` model using QNN compiler and saves the compiled ``qpc`` package at ``qnn_binary_dir``. Generates model.dlc during convertor stage, qnn_compile_backend.json for backend parameters of context-binary-generator. Generates tensor-slicing configuration if multiple devices are passed in ``device_group``. ``Mandatory`` Args: :onnx_path (str): Generated ``ONNX`` Model Path. - :qpc_path (str): Path for saving compiled qpc binaries. + :qpc_base_path (str): base directory for QNN compilation config & binary file. :num_cores (int): Number of cores to compile the model on. ``Optional`` Args: :device_group (List[int]): Used for finding the number of devices to compile for. @@ -341,6 +345,7 @@ def compile( :allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic ``Defaults to False.`` :mxint8 (bool): Compress Present/Past KV to ``MXINT8`` using ``CustomIO`` config. ``Defaults to False.`` :qnn_config (str): Path to ``qnn_config.json`` file (formatted as a string). ``Defaults to None.`` + :qnn_binary_dir (str): Path for saving qnn binaries. Returns: :str: Path to compiled ``qpc`` package. @@ -357,11 +362,11 @@ def compile( if mxint8: logger.warning("QNN doesn't support mxint8. Bypassing the value passed for mxint8") - os.makedirs(qpc_path, exist_ok=True) + os.makedirs(qpc_base_path, exist_ok=True) # Created custom_io_config.yaml file for QNN-Convertor stage. # TODO To make custom_io_config.yaml configurable as not all models need it. 
- custom_io_file_path = os.path.join(qpc_path, "custom_io_config.yaml") + custom_io_file_path = os.path.join(qpc_base_path, "custom_io_config.yaml") fetch_nodes_info( onnx_graph_path=onnx_path, batch_size=batch_size, @@ -373,12 +378,12 @@ def compile( if not os.path.isfile(custom_io_file_path): raise FileNotFoundError( - f"file {custom_io_file_path} needs to exist in the qpc_path for Compilation. Please rerun infer/compile Api" + f"file {custom_io_file_path} needs to exist in the qpc_base_path for Compilation. Please rerun infer/compile Api" ) qnn_obj = QNN( onnx_path=onnx_path, - qpc_path=qpc_path, + qpc_base_path=qpc_base_path, num_cores=num_cores, device_group=device_group, qnn_config_path=qnn_config, @@ -389,6 +394,7 @@ def compile( prompt_len=prompt_len, ctx_len=ctx_len, compiler_mxfp6_matmul_weights=mxfp6, + qnn_binary_dir=qnn_binary_dir, ) compiled_binary_path = qnn_obj.compile() diff --git a/QEfficient/compile/qnn_config.json b/QEfficient/compile/qnn_config.json index 18f12dd9..369b5598 100644 --- a/QEfficient/compile/qnn_config.json +++ b/QEfficient/compile/qnn_config.json @@ -3,6 +3,7 @@ "context_binary_generator_args_extension": "--log_level debug", "qnn_compilation_backend": { + "compiler_enable_depth_first": true, "compiler_printDDRStats": false, "compiler_printPerfMetrics": false, "compiler_stat_level": 10 diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 83c573f6..f565cbca 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -245,6 +245,8 @@ def compile( mxfp6_matmul: bool = False, mxint8_kv_cache: bool = False, num_speculative_tokens: Optional[int] = None, + enable_qnn: bool = False, + qnn_config: Optional[str] = None, **compiler_options, ) -> str: """ @@ -266,6 +268,8 @@ def compile( :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model. :mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``. :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. + :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.`` + :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` Returns: :str: Path of the compiled ``qpc`` package. 
@@ -311,28 +315,48 @@ def compile( decode_specialization.update({"num_logits_to_keep": num_speculative_tokens + 1}) if self.is_tlm else None specializations.append(decode_specialization) - # Custom IO - custom_io = {} - kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16" - for suffix in ["", "_RetainedState"]: - for i in range(self.num_layers): - for kv in ["key", "value"]: - custom_io[f"past_{kv}.{i}{suffix}"] = kv_cache_dtype - - return self._compile( - onnx_path, - compile_dir, - compile_only=True, - retained_state=True, - specializations=specializations, - convert_to_fp16=True, - mxfp6_matmul=mxfp6_matmul, - custom_io=custom_io, - mdp_ts_num_devices=num_devices, - num_speculative_tokens=num_speculative_tokens, - aic_num_cores=num_cores, - **compiler_options, - ) + if enable_qnn: + if compiler_options: + logger.warning("Extra arguments to QNN compilation are supported via qnn_config.json only") + + qpc_path = self._qnn_compile( + onnx_path, + compile_dir, + specializations=specializations, + prefill_seq_len=prefill_seq_len, + ctx_len=ctx_len, + batch_size=batch_size, + full_batch_size=full_batch_size, + mdp_ts_num_devices=num_devices, + num_cores=num_cores, + mxfp6_matmul=mxfp6_matmul, + mxint8_kv_cache=mxint8_kv_cache, + qnn_config=qnn_config, + ) + else: + # Custom IO + custom_io = {} + kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16" + for suffix in ["", "_RetainedState"]: + for i in range(self.num_layers): + for kv in ["key", "value"]: + custom_io[f"past_{kv}.{i}{suffix}"] = kv_cache_dtype + + qpc_path = self._compile( + onnx_path, + compile_dir, + compile_only=True, + retained_state=True, + specializations=specializations, + convert_to_fp16=True, + mxfp6_matmul=mxfp6_matmul, + custom_io=custom_io, + mdp_ts_num_devices=num_devices, + num_speculative_tokens=num_speculative_tokens, + aic_num_cores=num_cores, + **compiler_options, + ) + return qpc_path # FIXME: Update this method to match with transformers AutoModelForCausalLM.generate def generate( diff --git a/docs/source/hl_api.md b/docs/source/hl_api.md index 558965e7..5662b23a 100644 --- a/docs/source/hl_api.md +++ b/docs/source/hl_api.md @@ -47,6 +47,13 @@ import QEfficient base_path, onnx_model_path = QEfficient.export(model_name="gpt2") qpc_path = QEfficient.compile(onnx_path=onnx_model_path, qpc_path=os.path.join(base_path, "qpc"), num_cores=14, device_group=[0]) + + # Similarly, for a QPC compiled via the QNN SDK: + # 1. export $QNN_SDK_ROOT=/path/to/qnn_sdk_folder + # 2. pass enable_qnn=True in the compile call + # 3. An optional config file can be provided via qnn_config if the user wishes to override the default parameters. + qpc_path_qnn = QEfficient.compile(onnx_path=onnx_model_path, qpc_path=os.path.join(base_path, "qpc"), num_cores=14, device_group=[0], + enable_qnn=True, qnn_config="QEfficient/compile/qnn_config.json") .. deprecated:: This function will be deprecated in version 1.19, please use QEFFAutoModelForCausalLM.compile instead ``` @@ -54,6 +61,6 @@ ```{eval-rst} ..
automodule:: QEfficient.generation.text_generation_inference :members: - :show-inheritance: + :show-inheritance: :exclude-members: latency_stats_bertstyle,cloud_ai_100_exec_kv_helper ``` diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index f05540c8..c9f17a73 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -32,7 +32,7 @@ pipeline { parallel { stage('Run Non-CLI Non-QAIC Tests') { steps { - timeout(time: 10, unit: 'MINUTES') { + timeout(time: 25, unit: 'MINUTES') { sh ''' sudo docker exec ${BUILD_TAG} bash -c " cd /efficient-transformers && @@ -56,7 +56,7 @@ pipeline { mkdir -p $PWD/Non_qaic && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Non_qaic && - pytest tests -m '(not cli) and (on_qaic)' -n 4 --junitxml=tests/tests_log2.xml && + pytest tests -m '(not cli) and (on_qaic) and (not qnn)' -n 4 --junitxml=tests/tests_log2.xml && deactivate" ''' } @@ -84,7 +84,7 @@ pipeline { steps { timeout(time: 30, unit: 'MINUTES') { sh ''' - docker exec ${BUILD_TAG} bash -c " + sudo docker exec ${BUILD_TAG} bash -c " source /qnn_sdk/bin/envsetup.sh && source /qnn_sdk/bin/envcheck -c && cd /efficient-transformers && @@ -93,7 +93,25 @@ pipeline { export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Qnn_cli && pytest tests -m '(cli and qnn)' --junitxml=tests/tests_log4.xml && - junitparser merge tests/tests_log1.xml tests/tests_log2.xml tests/tests_log3.xml tests/tests_log4.xml tests/tests_log.xml && + deactivate" + ''' + } + } + } + stage('QNN Non-CLI Tests') { + steps { + timeout(time: 60, unit: 'MINUTES') { + sh ''' + sudo docker exec ${BUILD_TAG} bash -c " + source /qnn_sdk/bin/envsetup.sh && + source /qnn_sdk/bin/envcheck -c && + cd /efficient-transformers && + . preflight_qeff/bin/activate && + mkdir -p $PWD/Qnn_non_cli && + export TOKENIZERS_PARALLELISM=false && + export QEFF_HOME=$PWD/Qnn_non_cli && + pytest tests -m '(not cli) and (qnn) and (on_qaic)' --junitxml=tests/tests_log5.xml && + junitparser merge tests/tests_log1.xml tests/tests_log2.xml tests/tests_log3.xml tests/tests_log4.xml tests/tests_log5.xml tests/tests_log.xml && deactivate" ''' } diff --git a/tests/qnn_tests/test_causal_lm_models_qnn.py b/tests/qnn_tests/test_causal_lm_models_qnn.py new file mode 100644 index 00000000..50ad3551 --- /dev/null +++ b/tests/qnn_tests/test_causal_lm_models_qnn.py @@ -0,0 +1,172 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import numpy as np +import pytest +from transformers import AutoModelForCausalLM + +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM +from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers +from QEfficient.utils import hf_download +from QEfficient.utils._utils import load_hf_tokenizer +from QEfficient.utils.constants import Constants +from QEfficient.utils.device_utils import get_available_device_id +from QEfficient.utils.run_utils import ApiRunner + +test_models = [ + "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "gpt2", +] + + +def load_causal_lm_model(model_config): + """ + Function to load model from huggingface and transform to KV model + -------- + + :model_config: Dict + + :return model_hf, params + """ + model_path = hf_download( + repo_id=model_config["model_name"], + ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], + ) + model_hf = AutoModelForCausalLM.from_pretrained( + model_path, + use_cache=True, + num_hidden_layers=model_config["n_layer"], + attn_implementation="eager", + low_cpu_mem_usage=False, + ) # Run models for single layers only + params = sum(p.numel() for p in model_hf.parameters()) + model_hf.eval() + return model_hf, params + + +def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( + model_name: str, + prompt_len: int = Constants.PROMPT_LEN, + ctx_len: int = Constants.CTX_LEN, + n_layer: int = 1, +): + """ + Validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + :prompt_len (int): Prompt length for the model to compile. + :ctx_len (int): Maximum context length to compile the model. + :n_layers (int): Number of layers for the Model. + """ + replace_transformers_quantizers() + model_config = {"model_name": model_name} + model_config["n_layer"] = n_layer + + model_hf, _ = load_causal_lm_model(model_config) + + tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name) + config = model_hf.config + batch_size = len(Constants.INPUT_STR) + api_runner = ApiRunner( + batch_size, + tokenizer, + config, + Constants.INPUT_STR, + Constants.PROMPT_LEN, + Constants.CTX_LEN, + ) + + pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf) + + qeff_model = QEFFAutoModelForCausalLM(model_hf) + + pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model) + + assert ( + pytorch_hf_tokens == pytorch_kv_tokens + ).all(), "Tokens don't match for HF PyTorch model output and KV PyTorch model output" + + onnx_model_path = qeff_model.export() + ort_tokens = api_runner.run_kv_model_on_ort(onnx_model_path) + + assert (pytorch_kv_tokens == ort_tokens).all(), "Tokens don't match for ONNXRT output and PyTorch output." 
+ + if not get_available_device_id(): + pytest.skip("No available devices to run model on Cloud AI 100") + + _ = qeff_model.compile( + prefill_seq_len=prompt_len, + ctx_len=ctx_len, + num_cores=14, + mxfp6=False, + aic_enable_depth_first=False, + enable_qnn=True, + ) + exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR) + cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size + gen_len = ort_tokens.shape[-1] + assert ( + ort_tokens == cloud_ai_100_tokens[:, :gen_len] + ).all(), "Tokens don't match for ONNXRT output and Cloud AI 100 output." + + # testing for CB models + model_hf, _ = load_causal_lm_model(model_config) + full_batch_size = 4 + fbs_prompts = Constants.INPUT_STR * 4 + api_runner = ApiRunner( + batch_size, + tokenizer, + config, + fbs_prompts, + Constants.PROMPT_LEN, + Constants.CTX_LEN, + full_batch_size, + ) + + pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch_CB(model_hf) + pytorch_hf_tokens = np.vstack(pytorch_hf_tokens) + + qeff_model = QEFFAutoModelForCausalLM(model_hf, continuous_batching=True) + onnx_model_path = qeff_model.export() + + if not get_available_device_id(): + pytest.skip("No available devices to run model on Cloud AI 100") + + _ = qeff_model.compile( + prefill_seq_len=prompt_len, + ctx_len=ctx_len, + num_cores=14, + mxfp6=False, + aic_enable_depth_first=False, + full_batch_size=full_batch_size, + enable_qnn=True, + ) + exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts) + + assert all( + [ + all(pt_token[:24] == cloud_token[:24]) + for pt_token, cloud_token in zip(pytorch_hf_tokens, exec_info_fbs.generated_ids) + ] + ), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output." + + +@pytest.mark.on_qaic +@pytest.mark.qnn +@pytest.mark.parametrize("model_name", test_models) +def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): + """ + Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + """ + if model_name == "microsoft/Phi-3-mini-4k-instruct": + n_layer = 2 # test only 2 layer models + else: + n_layer = 1 + + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=n_layer)
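A minimal usage sketch of the high-level path this patch adds, mirroring the new QNN test: `enable_qnn=True` routes `QEFFAutoModelForCausalLM.compile` through `_qnn_compile()` and `qnn_compiler.compile`, while `qnn_config` optionally overrides the default QNN parameters. The model card, prompt, and `num_cores` below are placeholders, and a configured QNN SDK (with `$QNN_SDK_ROOT` sourced, as in the Jenkins QNN stages) is assumed.

from transformers import AutoTokenizer

from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM

# Placeholder model card; any causal LM supported by QEfficient follows the same flow.
qeff_model = QEFFAutoModelForCausalLM.from_pretrained("gpt2")

# compile() exports the ONNX model automatically if it has not been exported yet,
# then drives the QNN converter and context-binary-generator via _qnn_compile().
qpc_path = qeff_model.compile(
    prefill_seq_len=32,
    ctx_len=128,
    num_cores=14,
    mxfp6_matmul=False,
    enable_qnn=True,
    qnn_config="QEfficient/compile/qnn_config.json",  # optional override of default QNN parameters
)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
qeff_model.generate(tokenizer, prompts=["My name is"])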
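A similar sketch for the lower-level entry point, reflecting the rename from `qpc_path` to `qpc_base_path` and the new optional `qnn_binary_dir`; every path below is a placeholder.

from QEfficient.compile.qnn_compiler import compile as qnn_compile

qpc_binary_dir = qnn_compile(
    onnx_path="/path/to/model.onnx",     # placeholder: exported ONNX model
    qpc_base_path="/path/to/qpc",        # holds custom_io_config.yaml, model.dlc and the generated JSON configs
    num_cores=14,
    device_group=[0],                    # more than one device triggers tensor-slicing config generation
    batch_size=1,
    prompt_len=32,
    ctx_len=128,
    mxfp6=False,
    mxint8=False,
    qnn_config=None,                     # or a path to a qnn_config.json to override defaults
    qnn_binary_dir="/path/to/qpc/qpcs",  # where programqpc.bin lands; defaults to <qpc_base_path>/qpcs
)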