From 2151c79bf190cdd71794d224f8efea4f8b9cd207 Mon Sep 17 00:00:00 2001
From: pengwa
Date: Wed, 8 Nov 2023 17:42:50 +0800
Subject: [PATCH] Tune ORTModule logging experience a bit (#18298)

### Tune logging experience a bit

After the last update to the ORTModule logging experience, we found a few issues:

1. The `INFO` level outputs too many things, including PyTorch exporter verbose logs (tracing graphs) on every rank. At this level we only want to:
   - Output a little more information to users than the `WARNING` level, for example the memory recomputation recommendations or other not-fully-ready features.
   - Output a little more information for a quick diagnosis, collected on rank-0 only.
2. The ONNX Runtime log filtering applied during graph build and session initialization can hide real issues (for example a segment fault), leaving nothing useful at `WARNING`/`INFO` for users to report back to us. This is not good!
3. Some of our developers like using `pdb` to debug Python code, but adding `import pdb; pdb.set_trace()` in the model's code can hang at `INFO` or `WARNING`, because the export happens while all output is redirected by the log filtering. The only workaround is to switch to `VERBOSE`, which outputs far too many logs.

The corresponding changes proposed here are:

1. For `INFO` logging:
   - We log on rank-0 only.
   - We restrict the ORT backend logging level to `WARNING` in this case, because the ORT backend emits far too many logs that really belong at verbose level, and we cannot guarantee they get cleaned up immediately once added.
   - We output the PyTorch exporter verbose log (including the tracing graph), which is useful for a quick diagnosis when an issue happens.
2. Remove all log filtering on the ORT backend, so segment fault details will no longer be hidden if the issue happens again.
3. Introduce a `DEVINFO` log level:
   - Logs on all ranks.
   - Sets the ORT backend logging level to `INFO`.
   - Turns all PyTorch exporter log filtering OFF (to unblock pdb debugging).
4. Currently, using the Memory Optimizer requires `DEVINFO` (which also outputs ORT backend `INFO` logs), so the Memory Optimizer document is updated to reflect this. https://github.com/microsoft/onnxruntime/pull/17481 will move the requirement back to `INFO` for showing memory optimization info.

You can check https://github.com/microsoft/onnxruntime/blob/pengwa/devinfo_level/docs/ORTModule_Training_Guidelines.md#log-level-explanations for a better view of the different log levels.

This PR also extracts some changes from a bigger one, https://github.com/microsoft/onnxruntime/pull/17481, to reduce its complexity for review.
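To make the difference between the levels concrete, here is a minimal usage sketch (the toy model is illustrative only; the `ORTModule`/`DebugOptions`/`LogLevel` API is the one touched by this PR, assuming the usual `onnxruntime.training.ortmodule` import path):

```python
# Minimal sketch: choosing an ORTModule log level. The torch.nn.Linear model is a
# placeholder; DebugOptions/LogLevel/ORTModule come from onnxruntime.training.ortmodule.
import torch
from onnxruntime.training.ortmodule import DebugOptions, LogLevel, ORTModule

pt_model = torch.nn.Linear(8, 4)

# INFO: a bit more detail than WARNING, emitted on rank-0 only,
# with the ORT backend itself kept at WARNING.
ort_model = ORTModule(pt_model, DebugOptions(log_level=LogLevel.INFO))

# DEVINFO (developer-oriented): logs on all ranks, ORT backend at INFO,
# and PyTorch exporter filtering fully off.
# ort_model = ORTModule(pt_model, DebugOptions(log_level=LogLevel.DEVINFO))
```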
### Motivation and Context

---------

Co-authored-by: mindest <30493312+mindest@users.noreply.github.com>
---
 docs/Memory_Optimizer.md                      |  8 +-
 docs/ORTModule_Training_Guidelines.md         | 84 +++++++++++++++++++
 .../ortmodule/_graph_execution_manager.py     |  1 -
 .../training/ortmodule/_inference_manager.py  |  4 +-
 .../python/training/ortmodule/_logger.py      | 19 +++--
 .../training/ortmodule/_training_manager.py   |  4 +-
 .../python/training/ortmodule/options.py      |  5 --
 7 files changed, 101 insertions(+), 24 deletions(-)

diff --git a/docs/Memory_Optimizer.md b/docs/Memory_Optimizer.md
index 3ef3a575f20a1..e9ceae00a684d 100644
--- a/docs/Memory_Optimizer.md
+++ b/docs/Memory_Optimizer.md
@@ -20,10 +20,10 @@ Not all models and recipes need this optimizer technique. Imagine if your traini
 ## Quick trial

 1. Make sure ONNX Runtime training wheel is installed and correctly configured.
-2. Integrate models using `ORTModule`, be noted log_level should be equal or lower than INFO.
-   > ort_model = ORTModule(pt_model, DebugOptions(log_level=LogLevel.INFO))
-3. Run the training as usual and redirect all outputs into log file; then stop it after training few steps.
-4. Check the logging file, search "Summary", you could possibly find something like this:
+2. Integrate models using `ORTModule`; note that `log_level` should be equal to or lower than `DEVINFO`.
+   > ort_model = ORTModule(pt_model, DebugOptions(log_level=LogLevel.DEVINFO))
+3. Run the training as usual and redirect all outputs into a log file; then stop it after training a few steps.
+4. Check the log file and search for "Summary"; you should find something like this:
 ```
 	MemoryOptimizer Summary:
 	User config:
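The quick trial above boils down to something like the following sketch (the model, optimizer, and data are placeholders; running a handful of steps and capturing stdout/stderr, for example via the shell redirection in the comment, is enough to locate the summary):

```python
# Sketch of the Memory Optimizer quick trial: wrap the model with DEVINFO logging,
# train a few steps, then search the captured log for "MemoryOptimizer Summary".
# Run as, e.g.:  python quick_trial.py > ort_training.log 2>&1
import torch
from onnxruntime.training.ortmodule import DebugOptions, LogLevel, ORTModule

pt_model = torch.nn.Linear(16, 2)  # placeholder model
ort_model = ORTModule(pt_model, DebugOptions(log_level=LogLevel.DEVINFO))
optimizer = torch.optim.SGD(ort_model.parameters(), lr=1e-3)

for step in range(5):  # a few steps is enough for the summary to appear
    x = torch.randn(4, 16)
    loss = ort_model(x).sum()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
```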
diff --git a/docs/ORTModule_Training_Guidelines.md b/docs/ORTModule_Training_Guidelines.md
index 5350988b20964..12733c3551704 100644
--- a/docs/ORTModule_Training_Guidelines.md
+++ b/docs/ORTModule_Training_Guidelines.md
@@ -49,6 +49,90 @@ More options for **developers**.
 ```
 Check [DebugOptions implementation](../orttraining/orttraining/python/training/ortmodule/options.py) for more details.
+
+#### Log Level Explanations
+
+<table>
+<tr>
+<th>Log Level</th>
+<th>Description</th>
+</tr>
+<tr>
+<td>
+
+`FATAL` | `ERROR` | `WARNING` (For Users)
+
+`WARNING` is the default and recommended level for users.
+
+</td>
+<td>
+
+- ONNX Runtime backend log level - `FATAL` | `ERROR` | `WARNING`.
+- ORTModule log level - `FATAL` | `ERROR` | `WARNING`.
+- Rank-0 log filtering is `ON` (i.e. logging on rank-0 only).
+- PyTorch exporter export logs filtering is `ON`.
+- PyTorch exporter verbose logs (including tracing graph) filtering is `ON`.
+
+</td>
+</tr>
+<tr>
+<td>
+
+`INFO` (For Users | ORT Developers)
+
+`INFO` is used for collecting experimental feature stats, or slightly more detailed error messages.
+
+</td>
+<td>
+
+- ONNX Runtime backend log level - `WARNING`.
+- ORTModule log level - `INFO`.
+- Rank-0 log filtering is `ON` (i.e. logging on rank-0 only).
+- PyTorch exporter export logs filtering is `ON`.
+- PyTorch exporter verbose logs (including tracing graph) filtering is `OFF`.
+
+</td>
+</tr>
+<tr>
+<td>
+
+`DEVINFO` (For ORT Developers)
+
+`DEVINFO` is the recommended level for debugging purposes.
+
+</td>
+<td>
+
+- ONNX Runtime backend log level - `INFO`.
+- ORTModule log level - `INFO`.
+- Rank-0 log filtering is `OFF` (i.e. logging on all ranks).
+- PyTorch exporter export logs filtering is `OFF`.
+- PyTorch exporter verbose logs (including tracing graph) filtering is `OFF`.
+
+</td>
+</tr>
+<tr>
+<td>
+
+`VERBOSE` (For ORT Developers)
+
+`VERBOSE` is the last resort for debugging hard problems.
+
+</td>
+<td>
+
+- ONNX Runtime backend log level - `VERBOSE`.
+- ORTModule log level - `VERBOSE`.
+- Rank-0 log filtering is `OFF` (i.e. logging on all ranks).
+- PyTorch exporter export logs filtering is `OFF`.
+- PyTorch exporter verbose logs (including tracing graph) filtering is `OFF`.
+
+</td>
+</tr>
+</table>
+
+
 ### 2.1 Environment Variables

 `ORTModule` provides environment variables targeting different use cases.

diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py
index 04820218b7c49..5eb1d9f382380 100755
--- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py
+++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py
@@ -473,7 +473,6 @@ def _get_graph_transformer_config(self) -> C.TrainingGraphTransformerConfigurati
         return graph_transformer_config

     @_logger.TrackTime(_logger.ORTModuleInitPhase.GRAPH_BUILDER_INIT)
-    @_logger.SuppressLogs(_logger.ORTModuleInitPhase.GRAPH_BUILDER_INIT)
     def _initialize_graph_builder(self):
         """Creates a new OrtModuleGraphBuilder, initializes it and saves it to self._graph_builder"""

diff --git a/orttraining/orttraining/python/training/ortmodule/_inference_manager.py b/orttraining/orttraining/python/training/ortmodule/_inference_manager.py
index 8d8be81c549d1..6690af9b71bf1 100644
--- a/orttraining/orttraining/python/training/ortmodule/_inference_manager.py
+++ b/orttraining/orttraining/python/training/ortmodule/_inference_manager.py
@@ -16,7 +16,7 @@ from ._fallback import ORTModuleFallbackException, _FallbackManager, _FallbackPolicy
 from ._graph_execution_manager import GraphExecutionManager, _RunStateInfo
 from ._io import unflatten_user_output
-from ._logger import ORTModuleInitPhase, SuppressLogs, TrackTime
+from ._logger import ORTModuleInitPhase, TrackTime
 from ._utils import save_tuning_results, set_tuning_results
 from .options import DebugOptions, _SkipCheck
@@ -207,7 +207,6 @@ def forward(self, *inputs, **kwargs):
             return self._fallback_manager.fallback(self._debug_options.logging.log_level, *inputs, **kwargs)

     @TrackTime(ORTModuleInitPhase.BUILD_GRAPH)
-    @SuppressLogs(ORTModuleInitPhase.BUILD_GRAPH)
     def _build_graph(self, graph_transformer_config):
         """Build an inference graph using the module_graph_builder"""

@@ -221,7 +220,6 @@ def _build_graph(self, graph_transformer_config):
         )

     @TrackTime(ORTModuleInitPhase.CREATE_SESSION)
-    @SuppressLogs(ORTModuleInitPhase.CREATE_SESSION)
     def _create_execution_agent(self):
         """Creates an InferenceAgent that can run forward graph on an inference model"""

diff --git a/orttraining/orttraining/python/training/ortmodule/_logger.py b/orttraining/orttraining/python/training/ortmodule/_logger.py
index e075ced8eaac2..0728ebdf19af8 100644
--- a/orttraining/orttraining/python/training/ortmodule/_logger.py
+++ b/orttraining/orttraining/python/training/ortmodule/_logger.py
@@ -21,15 +21,18 @@ class LogLevel(IntEnum):
     VERBOSE = 0
-    INFO = 1
-    WARNING = 2
-    ERROR = 3
-    FATAL = 4
+    DEVINFO = 1  # For ORT developers.
+    INFO = 2  # For ORT users.
+    WARNING = 3
+    ERROR = 4
+    FATAL = 5

 ORTMODULE_LOG_LEVEL_MAP: Dict[LogLevel, List[int]] = {
     LogLevel.VERBOSE: [Severity.VERBOSE, logging.DEBUG],
-    LogLevel.INFO: [Severity.INFO, logging.INFO],
+    LogLevel.DEVINFO: [Severity.INFO, logging.INFO],
+    # ONNX Runtime has too many INFO logs, so we map it to WARNING for a better user experience.
+    LogLevel.INFO: [Severity.WARNING, logging.INFO],
     LogLevel.WARNING: [Severity.WARNING, logging.WARNING],
     LogLevel.ERROR: [Severity.ERROR, logging.ERROR],
     LogLevel.FATAL: [Severity.FATAL, logging.FATAL],
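To make the new mapping concrete, the sketch below prints the ONNX Runtime backend severity and Python logging level that each `LogLevel` resolves to; importing from the private `_logger` module is an assumption about the installed package layout:

```python
# Illustration only: show how each ORTModule LogLevel maps to an ONNX Runtime backend
# severity and a Python logging level via ORTMODULE_LOG_LEVEL_MAP from this hunk.
import logging

from onnxruntime.training.ortmodule._logger import LogLevel, ORTMODULE_LOG_LEVEL_MAP

for level in LogLevel:
    ort_severity, python_level = ORTMODULE_LOG_LEVEL_MAP[level]
    print(f"{level.name:8s} -> ORT backend severity {ort_severity}, "
          f"Python logging level {logging.getLevelName(python_level)}")
```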
@@ -48,13 +51,13 @@ def configure_ortmodule_logger(log_level: LogLevel) -> logging.Logger:
     """Configure the logger for ortmodule according to following rules.
         1. If multiple processes are used, the rank will be appended
            to the logger name.
-        2. If the log level is greater than info, the logger will be
+        2. If the log level is equal to or greater than INFO, the logger will be
            disabled for non-zero ranks.
     """
     rank_info = f".rank-{get_rank()}" if get_world_size() > 1 else ""
     logger = logging.getLogger(f"orttraining{rank_info}")
-    # Disable the logger for non-zero ranks when level > info
-    logger.disabled = log_level > LogLevel.INFO and get_rank() != 0
+    # Disable the logger for non-zero ranks when level >= INFO
+    logger.disabled = log_level >= LogLevel.INFO and get_rank() != 0
     logger.setLevel(ortmodule_loglevel_to_python_loglevel(log_level))
     return logger

diff --git a/orttraining/orttraining/python/training/ortmodule/_training_manager.py b/orttraining/orttraining/python/training/ortmodule/_training_manager.py
index e0f11e5aa407e..bafb64235546b 100644
--- a/orttraining/orttraining/python/training/ortmodule/_training_manager.py
+++ b/orttraining/orttraining/python/training/ortmodule/_training_manager.py
@@ -18,7 +18,7 @@ from ._gradient_accumulation_manager import GradientAccumulationManager
 from ._graph_execution_manager import GraphExecutionManager, _RunStateInfo
 from ._io import _FlattenedModule, _InputInfo, unflatten_user_output
-from ._logger import ORTModuleInitPhase, SuppressLogs, TrackTime
+from ._logger import ORTModuleInitPhase, TrackTime
 from ._runtime_inspector import Phase
 from ._utils import save_tuning_results, set_tuning_results
 from .graph_optimizer_registry import GraphOptimizerRegistry
@@ -358,7 +358,6 @@ def forward(self, *inputs, **kwargs):
             return self._fallback_manager.fallback(self._debug_options.logging.log_level, *inputs, **kwargs)

     @TrackTime(ORTModuleInitPhase.BUILD_GRAPH)
-    @SuppressLogs(ORTModuleInitPhase.BUILD_GRAPH)
     def _build_graph(self, graph_transformer_config):
         """Build an optimized gradient graph using the module_graph_builder"""

@@ -401,7 +400,6 @@ def _build_graph(self, graph_transformer_config):
             self._gradient_map.append(-1)

     @TrackTime(ORTModuleInitPhase.CREATE_SESSION)
-    @SuppressLogs(ORTModuleInitPhase.CREATE_SESSION)
     def _create_execution_agent(self):
         """Creates a TrainingAgent that can run the forward and backward graph on the training model"""

diff --git a/orttraining/orttraining/python/training/ortmodule/options.py b/orttraining/orttraining/python/training/ortmodule/options.py
index ff0cde37195cb..cddd9cd440b28 100644
--- a/orttraining/orttraining/python/training/ortmodule/options.py
+++ b/orttraining/orttraining/python/training/ortmodule/options.py
@@ -167,11 +167,6 @@ def torch_exporter_filter(self):
     @property
     def onnxruntime_log_filter(self):
         """Accessor for the filter onnxruntime logs configuration."""
-        if self.log_level >= LogLevel.INFO:
-            return [
-                "CleanUnusedInitializersAndNodeArgs] Removing initializer",
-                "Serializing optimized model with Graph Optimization level greater than ORT_ENABLE_EXTENDED",
-            ]
         return None
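Finally, the rank filtering rule introduced in `configure_ortmodule_logger` above can be illustrated with a small standalone sketch; the `should_disable` helper is hypothetical and only mirrors the predicate from the diff:

```python
# Hypothetical helper mirroring the predicate in configure_ortmodule_logger:
# the ORTModule logger is disabled on non-zero ranks once the level is INFO or above.
from onnxruntime.training.ortmodule import LogLevel


def should_disable(log_level: LogLevel, rank: int) -> bool:
    return log_level >= LogLevel.INFO and rank != 0


assert should_disable(LogLevel.INFO, rank=1) is True      # rank-1 is silenced at INFO
assert should_disable(LogLevel.INFO, rank=0) is False     # rank-0 keeps logging
assert should_disable(LogLevel.DEVINFO, rank=1) is False  # DEVINFO logs on all ranks
```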