Adding option to disable DCGM when in remote mode #952

Merged (2 commits) on Dec 20, 2024
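This PR adds a `--dcgm-disable` boolean flag; the config validation below requires it to be paired with `--triton-launch-mode remote`. A minimal invocation sketch assembled from the test arguments in this diff (the model name and repository path are placeholders):

    model-analyzer profile --profile-models modelA --model-repository cli-repository --triton-launch-mode remote --dcgm-disable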
6 changes: 5 additions & 1 deletion model_analyzer/analyzer.py
@@ -214,7 +214,11 @@ def _create_model_manager(self, client, gpus):
     def _get_server_only_metrics(self, client, gpus):
         if self._config.triton_launch_mode != "c_api":
             if not self._state_manager._starting_fresh_run:
-                if self._do_checkpoint_gpus_match(gpus):
+                if self._config.dcgm_disable:
+                    logger.info(
+                        "DCGM is disabled - cannot verify that GPU devices match checkpoint"
+                    )
+                elif self._do_checkpoint_gpus_match(gpus):
                     logger.info(
                         "GPU devices match checkpoint - skipping server metric acquisition"
                     )
14 changes: 14 additions & 0 deletions model_analyzer/config/input/config_command.py
@@ -129,6 +129,7 @@ def _check_for_illegal_config_settings(
         self._check_for_bls_incompatibility(args, yaml_config)
         self._check_for_concurrency_rate_request_conflicts(args, yaml_config)
         self._check_for_config_search_rate_request_conflicts(args, yaml_config)
+        self._check_for_dcgm_disable_launch_mode_conflict(args, yaml_config)

     def _set_field_values(
         self, args: Namespace, yaml_config: Optional[Dict[str, List]]
@@ -398,6 +399,19 @@ def _check_for_config_search_rate_request_conflicts(
f"\nCannot have both `run-config-search-max-request-rate` and `run-config-search-min/max-concurrency` specified in the config/CLI."
)

def _check_for_dcgm_disable_launch_mode_conflict(
self, args: Namespace, yaml_config: Optional[Dict[str, List]]
) -> None:
if self._get_config_value("dcgm_disable", args, yaml_config):
launch_mode = self._get_config_value(
"triton_launch_mode", args, yaml_config
)

if launch_mode != "remote":
raise TritonModelAnalyzerException(
f"\nIf `dcgm-disable` then `triton-launch-mode` must be set to remote"
)

def _preprocess_and_verify_arguments(self):
"""
Enforces some rules on the config.
Expand Down
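Stripped of Model Analyzer's config plumbing, the new guard reduces to the following sketch (a simplified illustration: `config` here is a plain dict and ValueError stands in for TritonModelAnalyzerException; the real code reads values through `_get_config_value`):

    # Simplified sketch of the dcgm-disable / launch-mode conflict check.
    def check_dcgm_disable_conflict(config: dict) -> None:
        if config.get("dcgm_disable") and config.get("triton_launch_mode") != "remote":
            raise ValueError("`dcgm-disable` requires `triton-launch-mode` to be remote")

    check_dcgm_disable_conflict({"dcgm_disable": True, "triton_launch_mode": "remote"})  # passes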
11 changes: 11 additions & 0 deletions model_analyzer/config/input/config_command_profile.py
@@ -45,6 +45,7 @@
     DEFAULT_CLIENT_PROTOCOL,
     DEFAULT_COLLECT_CPU_METRICS,
     DEFAULT_CONCURRENCY_SWEEP_DISABLE,
+    DEFAULT_DCGM_DISABLE,
     DEFAULT_DURATION_SECONDS,
     DEFAULT_EXPORT_PATH,
     DEFAULT_FILENAME_MODEL_GPU,
@@ -288,6 +289,16 @@ def _fill_config(self):
                 description="Report GPU metrics, even when the model is `cpu_only`.",
             )
         )
+        self._add_config(
+            ConfigField(
+                "dcgm_disable",
+                field_type=ConfigPrimitive(bool),
+                flags=["--dcgm-disable"],
+                parser_args={"action": "store_true"},
+                default_value=DEFAULT_DCGM_DISABLE,
+                description="Disables DCGM, which prevents obtaining information about GPUs.",
+            )
+        )
         self._add_config(
             ConfigField(
                 "skip_summary_reports",
1 change: 1 addition & 0 deletions model_analyzer/config/input/config_defaults.py
@@ -62,6 +62,7 @@
 DEFAULT_USE_CONCURRENCY_FORMULA = False
 DEFAULT_REQUEST_RATE_SEARCH_ENABLE = False
 DEFAULT_CONCURRENCY_SWEEP_DISABLE = False
+DEFAULT_DCGM_DISABLE = False
 DEFAULT_TRITON_LAUNCH_MODE = "local"
 DEFAULT_TRITON_DOCKER_IMAGE = "nvcr.io/nvidia/tritonserver:24.11-py3"
 DEFAULT_TRITON_HTTP_ENDPOINT = "localhost:8000"
6 changes: 5 additions & 1 deletion model_analyzer/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,11 @@
)

# Set up devices
gpus = GPUDeviceFactory().verify_requested_gpus(config.gpus)
foo = config.dcgm_disable
Fixed Show fixed Hide fixed
if config.dcgm_disable:
gpus = []
else:
gpus = GPUDeviceFactory().verify_requested_gpus(config.gpus)

# Check/create output model repository
create_output_model_repository(config)
Expand Down
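In isolation, the device-setup branch behaves like this sketch (the config object is a hypothetical stand-in; `GPUDeviceFactory` is stubbed out because the real class needs DCGM):

    from types import SimpleNamespace

    def verify_requested_gpus_stub(requested):
        # Stand-in for GPUDeviceFactory().verify_requested_gpus(...),
        # which would query DCGM for the physical devices.
        raise RuntimeError("DCGM required")

    config = SimpleNamespace(dcgm_disable=True, gpus=["all"])

    if config.dcgm_disable:
        gpus = []  # no DCGM: skip GPU discovery and verification entirely
    else:
        gpus = verify_requested_gpus_stub(config.gpus)

    assert gpus == []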
23 changes: 13 additions & 10 deletions model_analyzer/triton/server/server_factory.py
@@ -126,7 +126,9 @@ def get_server_handle(config, gpus, use_model_repository=False):
         """

         if config.triton_launch_mode == "remote":
-            server = TritonServerFactory._get_remote_server_handle(config)
+            server = TritonServerFactory._get_remote_server_handle(
+                config, print_warning_message=use_model_repository
+            )
         elif config.triton_launch_mode == "local":
             server = TritonServerFactory._get_local_server_handle(
                 config, gpus, use_model_repository=True
@@ -147,22 +149,23 @@ def get_server_handle(config, gpus, use_model_repository=False):
         return server

     @staticmethod
-    def _get_remote_server_handle(config):
+    def _get_remote_server_handle(config, print_warning_message=True):
         triton_config = TritonServerConfig()
         triton_config.update_config(config.triton_server_flags)
         triton_config["model-repository"] = "remote-model-repository"
         logger.info("Using remote Triton Server")
         server = TritonServerFactory.create_server_local(
             path=None, config=triton_config, gpus=[], log_path=""
         )
-        logger.warning(
-            "GPU memory metrics reported in the remote mode are not"
-            " accurate. Model Analyzer uses Triton explicit model control to"
-            " load/unload models. Some frameworks do not release the GPU"
-            " memory even when the memory is not being used. Consider"
-            ' using the "local" or "docker" mode if you want to accurately'
-            " monitor the GPU memory usage for different models."
-        )
+        if print_warning_message:
+            logger.warning(
+                "GPU memory metrics reported in the remote mode are not"
+                " accurate. Model Analyzer uses Triton explicit model control to"
+                " load/unload models. Some frameworks do not release the GPU"
+                " memory even when the memory is not being used. Consider"
+                ' using the "local" or "docker" mode if you want to accurately'
+                " monitor the GPU memory usage for different models."
+            )

         return server
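The net effect: the remote-mode GPU-memory warning is now emitted only when the handle is created with `use_model_repository=True`, presumably so it appears once where it matters rather than on every handle fetch. The gating pattern in miniature (a simplified sketch without the Triton server plumbing):

    import logging

    logger = logging.getLogger("server_factory_sketch")

    def get_remote_handle_sketch(print_warning_message: bool = True) -> None:
        # Mirrors _get_remote_server_handle: callers fetching a lightweight
        # handle can now suppress the repeated warning.
        if print_warning_message:
            logger.warning("GPU memory metrics reported in remote mode are not accurate.")

    get_remote_handle_sketch(print_warning_message=False)  # silent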
1 change: 1 addition & 0 deletions tests/test_cli.py
@@ -67,6 +67,7 @@ def get_test_options():
         OptionStruct("bool", "profile","--always-report-gpu-metrics"),
         OptionStruct("bool", "profile","--use-concurrency-formula"),
         OptionStruct("bool", "profile","--concurrency-sweep-disable"),
+        OptionStruct("bool", "profile","--dcgm-disable"),


         #Int/Float options
76 changes: 76 additions & 0 deletions tests/test_config.py
@@ -2388,6 +2388,82 @@ def test_model_type_llm(self):
             config.inference_output_fields, DEFAULT_LLM_INFERENCE_OUTPUT_FIELDS
         )

+    def test_dcgm_disable_and_launch_mode(self):
+        """
+        Test that an exception is raised when DCGM is disabled unless launch mode is remote
+        """
+
+        # Should raise an exception for docker, local, and c_api launch modes
+        args = [
+            "model-analyzer",
+            "profile",
+            "--profile-models",
+            "modelA",
+            "--model-repository",
+            "cli-repository",
+            "-f",
+            "path-to-config-file",
+            "--dcgm-disable",
+            "--triton-launch-mode",
+            "docker",
+        ]
+
+        yaml_content = ""
+
+        with self.assertRaises(TritonModelAnalyzerException):
+            self._evaluate_config(args, yaml_content, subcommand="profile")
+
+        args = [
+            "model-analyzer",
+            "profile",
+            "--profile-models",
+            "modelA",
+            "--model-repository",
+            "cli-repository",
+            "-f",
+            "path-to-config-file",
+            "--dcgm-disable",
+            "--triton-launch-mode",
+            "local",
+        ]
+
+        with self.assertRaises(TritonModelAnalyzerException):
+            self._evaluate_config(args, yaml_content, subcommand="profile")
+
+        args = [
+            "model-analyzer",
+            "profile",
+            "--profile-models",
+            "modelA",
+            "--model-repository",
+            "cli-repository",
+            "-f",
+            "path-to-config-file",
+            "--dcgm-disable",
+            "--triton-launch-mode",
+            "c_api",
+        ]
+
+        with self.assertRaises(TritonModelAnalyzerException):
+            self._evaluate_config(args, yaml_content, subcommand="profile")
+
+        # Should not raise an exception for remote mode
+        args = [
+            "model-analyzer",
+            "profile",
+            "--profile-models",
+            "modelA",
+            "--model-repository",
+            "cli-repository",
+            "-f",
+            "path-to-config-file",
+            "--dcgm-disable",
+            "--triton-launch-mode",
+            "remote",
+        ]
+
+        _ = self._evaluate_config(args, yaml_content, subcommand="profile")
+
     def _test_request_rate_config_conflicts(
         self, base_args: List[Any], yaml_content: str
     ) -> None: