diff --git a/docs/Memory_Optimizer.md b/docs/Memory_Optimizer.md
index 3ef3a575f20a1..e9ceae00a684d 100644
--- a/docs/Memory_Optimizer.md
+++ b/docs/Memory_Optimizer.md
@@ -20,10 +20,10 @@ Not all models and recipes need this optimizer technique. Imagine if your traini
 
 ## Quick trial
 
 1. Make sure ONNX Runtime training wheel is installed and correctly configured.
-2. Integrate models using `ORTModule`, be noted log_level should be equal or lower than INFO.
-   > ort_model = ORTModule(pt_model, DebugOptions(log_level=LogLevel.INFO))
-3. Run the training as usual and redirect all outputs into log file; then stop it after training few steps.
-4. Check the logging file, search "Summary", you could possibly find something like this:
+2. Integrate the model using `ORTModule`; note that `log_level` must be equal to or lower than `DEVINFO`.
+   > ort_model = ORTModule(pt_model, DebugOptions(log_level=LogLevel.DEVINFO))
+3. Run the training as usual and redirect all outputs into a log file; then stop it after training a few steps.
+4. Check the log file and search for "Summary"; you should find output like this:
 ```
 MemoryOptimizer Summary:
 User config:
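For reference, here is a minimal runnable sketch of the quick trial above. The model, shapes, and training loop are placeholder assumptions; only `ORTModule`, `DebugOptions`, and `LogLevel` come from the documented API, imported as in the guidelines below:

```python
# Sketch of the Memory Optimizer quick trial; model and data are placeholders.
import torch
from onnxruntime.training.ortmodule import ORTModule, DebugOptions, LogLevel

pt_model = torch.nn.Sequential(
    torch.nn.Linear(128, 256), torch.nn.ReLU(), torch.nn.Linear(256, 10)
)

# Step 2: log_level must be DEVINFO or lower for the MemoryOptimizer
# summary to be emitted.
ort_model = ORTModule(pt_model, DebugOptions(log_level=LogLevel.DEVINFO))
optimizer = torch.optim.SGD(ort_model.parameters(), lr=1e-3)

# Step 3: run a few steps (e.g. `python train.py > run.log 2>&1`), then stop
# and search run.log for "Summary".
for _ in range(4):
    optimizer.zero_grad()
    loss = ort_model(torch.randn(32, 128)).sum()
    loss.backward()
    optimizer.step()
```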
diff --git a/docs/ORTModule_Training_Guidelines.md b/docs/ORTModule_Training_Guidelines.md
index 5350988b20964..12733c3551704 100644
--- a/docs/ORTModule_Training_Guidelines.md
+++ b/docs/ORTModule_Training_Guidelines.md
@@ -49,6 +49,90 @@ More options for **developers**.
 ```
 Check [DebugOptions implementation](../orttraining/orttraining/python/training/ortmodule/options.py) for more details.
 
+#### Log Level Explanations
+
+<table>
+<tr>
+<th>Log Level</th>
+<th>Description</th>
+</tr>
+<tr>
+<td>
+
+`FATAL` | `ERROR` | `WARNING` (For Users)
+
+`WARNING` is the default and recommended level for users.
+
+</td>
+<td>
+
+- ONNX Runtime backend log level - `FATAL` | `ERROR` | `WARNING`.
+- ORTModule log level - `FATAL` | `ERROR` | `WARNING`.
+- Rank-0 log filtering is `ON` (i.e. logging on rank 0 only).
+- PyTorch exporter export logs filtering is `ON`.
+- PyTorch exporter verbose logs (including tracing graph) filtering is `ON`.
+
+</td>
+</tr>
+<tr>
+<td>
+
+`INFO` (For Users | ORT Developers)
+
+`INFO` is used for collecting experimental feature stats, or a bit more detailed error messages.
+
+</td>
+<td>
+
+- ONNX Runtime backend log level - `WARNING`.
+- ORTModule log level - `INFO`.
+- Rank-0 log filtering is `ON` (i.e. logging on rank 0 only).
+- PyTorch exporter export logs filtering is `ON`.
+- PyTorch exporter verbose logs (including tracing graph) filtering is `OFF`.
+
+</td>
+</tr>
+<tr>
+<td>
+
+`DEVINFO` (For ORT Developers)
+
+`DEVINFO` is the recommended level for debugging purposes.
+
+</td>
+<td>
+
+- ONNX Runtime backend log level - `INFO`.
+- ORTModule log level - `INFO`.
+- Rank-0 log filtering is `OFF` (i.e. logging on all ranks).
+- PyTorch exporter export logs filtering is `OFF`.
+- PyTorch exporter verbose logs (including tracing graph) filtering is `OFF`.
+
+</td>
+</tr>
+<tr>
+<td>
+
+`VERBOSE` (For ORT Developers)
+
+`VERBOSE` is the last resort for debugging hard problems.
+
+</td>
+<td>
+
+- ONNX Runtime backend log level - `VERBOSE`.
+- ORTModule log level - `VERBOSE`.
+- Rank-0 log filtering is `OFF` (i.e. logging on all ranks).
+- PyTorch exporter export logs filtering is `OFF`.
+- PyTorch exporter verbose logs (including tracing graph) filtering is `OFF`.
+
+</td>
+</tr>
+</table>
+
 
 ### 2.1 Environment Variables
 
 `ORTModule` provides environment variables targeting different use cases.
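As a usage sketch for the table above: the import path follows the examples elsewhere in these guidelines, the wrapped module is a placeholder, and the commented alternative shows the developer-oriented level added in this change.

```python
import torch
from onnxruntime.training.ortmodule import ORTModule, DebugOptions, LogLevel

pt_model = torch.nn.Linear(8, 8)  # placeholder module

# Default user-facing behavior: WARNING (rank-0-only logging, exporter noise filtered).
model = ORTModule(pt_model, DebugOptions(log_level=LogLevel.WARNING))

# Developer debugging: DEVINFO logs on all ranks, disables exporter log filtering,
# and raises the ONNX Runtime backend level to INFO (see the table above).
# model = ORTModule(pt_model, DebugOptions(log_level=LogLevel.DEVINFO))
```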
diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py
index 04820218b7c49..5eb1d9f382380 100755
--- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py
+++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py
@@ -473,7 +473,6 @@ def _get_graph_transformer_config(self) -> C.TrainingGraphTransformerConfigurati
         return graph_transformer_config
 
     @_logger.TrackTime(_logger.ORTModuleInitPhase.GRAPH_BUILDER_INIT)
-    @_logger.SuppressLogs(_logger.ORTModuleInitPhase.GRAPH_BUILDER_INIT)
     def _initialize_graph_builder(self):
         """Creates a new OrtModuleGraphBuilder, initializes it and saves it to self._graph_builder"""
 
diff --git a/orttraining/orttraining/python/training/ortmodule/_inference_manager.py b/orttraining/orttraining/python/training/ortmodule/_inference_manager.py
index 8d8be81c549d1..6690af9b71bf1 100644
--- a/orttraining/orttraining/python/training/ortmodule/_inference_manager.py
+++ b/orttraining/orttraining/python/training/ortmodule/_inference_manager.py
@@ -16,7 +16,7 @@
 from ._fallback import ORTModuleFallbackException, _FallbackManager, _FallbackPolicy
 from ._graph_execution_manager import GraphExecutionManager, _RunStateInfo
 from ._io import unflatten_user_output
-from ._logger import ORTModuleInitPhase, SuppressLogs, TrackTime
+from ._logger import ORTModuleInitPhase, TrackTime
 from ._utils import save_tuning_results, set_tuning_results
 from .options import DebugOptions, _SkipCheck
 
@@ -207,7 +207,6 @@ def forward(self, *inputs, **kwargs):
             return self._fallback_manager.fallback(self._debug_options.logging.log_level, *inputs, **kwargs)
 
     @TrackTime(ORTModuleInitPhase.BUILD_GRAPH)
-    @SuppressLogs(ORTModuleInitPhase.BUILD_GRAPH)
     def _build_graph(self, graph_transformer_config):
         """Build an inference graph using the module_graph_builder"""
 
@@ -221,7 +220,6 @@ def _build_graph(self, graph_transformer_config):
         )
 
     @TrackTime(ORTModuleInitPhase.CREATE_SESSION)
-    @SuppressLogs(ORTModuleInitPhase.CREATE_SESSION)
     def _create_execution_agent(self):
         """Creates an InferenceAgent that can run forward graph on an inference model"""
 
diff --git a/orttraining/orttraining/python/training/ortmodule/_logger.py b/orttraining/orttraining/python/training/ortmodule/_logger.py
index e075ced8eaac2..0728ebdf19af8 100644
--- a/orttraining/orttraining/python/training/ortmodule/_logger.py
+++ b/orttraining/orttraining/python/training/ortmodule/_logger.py
@@ -21,15 +21,18 @@
 
 class LogLevel(IntEnum):
     VERBOSE = 0
-    INFO = 1
-    WARNING = 2
-    ERROR = 3
-    FATAL = 4
+    DEVINFO = 1  # For ORT developers.
+    INFO = 2  # For ORT users.
+    WARNING = 3
+    ERROR = 4
+    FATAL = 5
 
 
 ORTMODULE_LOG_LEVEL_MAP: Dict[LogLevel, List[int]] = {
     LogLevel.VERBOSE: [Severity.VERBOSE, logging.DEBUG],
-    LogLevel.INFO: [Severity.INFO, logging.INFO],
+    LogLevel.DEVINFO: [Severity.INFO, logging.INFO],
+    # ONNX Runtime has too many INFO logs, so we map it to WARNING for a better user experience.
+    LogLevel.INFO: [Severity.WARNING, logging.INFO],
     LogLevel.WARNING: [Severity.WARNING, logging.WARNING],
     LogLevel.ERROR: [Severity.ERROR, logging.ERROR],
     LogLevel.FATAL: [Severity.FATAL, logging.FATAL],
@@ -48,13 +51,13 @@ def configure_ortmodule_logger(log_level: LogLevel) -> logging.Logger:
     """Configure the logger for ortmodule according to following rules.
     1. If multiple processes are used, the rank will be appended
        to the logger name.
-    2. If the log level is greater than info, the logger will be
+    2. If the log level is equal to or greater than INFO, the logger will be
        disabled for non-zero ranks.
     """
     rank_info = f".rank-{get_rank()}" if get_world_size() > 1 else ""
     logger = logging.getLogger(f"orttraining{rank_info}")
-    # Disable the logger for non-zero ranks when level > info
-    logger.disabled = log_level > LogLevel.INFO and get_rank() != 0
+    # Disable the logger for non-zero ranks when level >= INFO
+    logger.disabled = log_level >= LogLevel.INFO and get_rank() != 0
     logger.setLevel(ortmodule_loglevel_to_python_loglevel(log_level))
     return logger
 
diff --git a/orttraining/orttraining/python/training/ortmodule/_training_manager.py b/orttraining/orttraining/python/training/ortmodule/_training_manager.py
index e0f11e5aa407e..bafb64235546b 100644
--- a/orttraining/orttraining/python/training/ortmodule/_training_manager.py
+++ b/orttraining/orttraining/python/training/ortmodule/_training_manager.py
@@ -18,7 +18,7 @@
 from ._gradient_accumulation_manager import GradientAccumulationManager
 from ._graph_execution_manager import GraphExecutionManager, _RunStateInfo
 from ._io import _FlattenedModule, _InputInfo, unflatten_user_output
-from ._logger import ORTModuleInitPhase, SuppressLogs, TrackTime
+from ._logger import ORTModuleInitPhase, TrackTime
 from ._runtime_inspector import Phase
 from ._utils import save_tuning_results, set_tuning_results
 from .graph_optimizer_registry import GraphOptimizerRegistry
 
@@ -358,7 +358,6 @@ def forward(self, *inputs, **kwargs):
             return self._fallback_manager.fallback(self._debug_options.logging.log_level, *inputs, **kwargs)
 
     @TrackTime(ORTModuleInitPhase.BUILD_GRAPH)
-    @SuppressLogs(ORTModuleInitPhase.BUILD_GRAPH)
     def _build_graph(self, graph_transformer_config):
         """Build an optimized gradient graph using the module_graph_builder"""
 
@@ -401,7 +400,6 @@ def _build_graph(self, graph_transformer_config):
         self._gradient_map.append(-1)
 
     @TrackTime(ORTModuleInitPhase.CREATE_SESSION)
-    @SuppressLogs(ORTModuleInitPhase.CREATE_SESSION)
     def _create_execution_agent(self):
         """Creates a TrainingAgent that can run the forward and backward graph on the training model"""
 
diff --git a/orttraining/orttraining/python/training/ortmodule/options.py b/orttraining/orttraining/python/training/ortmodule/options.py
index ff0cde37195cb..cddd9cd440b28 100644
--- a/orttraining/orttraining/python/training/ortmodule/options.py
+++ b/orttraining/orttraining/python/training/ortmodule/options.py
@@ -167,11 +167,6 @@ def torch_exporter_filter(self):
 
     @property
     def onnxruntime_log_filter(self):
         """Accessor for the filter onnxruntime logs configuration."""
-        if self.log_level >= LogLevel.INFO:
-            return [
-                "CleanUnusedInitializersAndNodeArgs] Removing initializer",
-                "Serializing optimized model with Graph Optimization level greater than ORT_ENABLE_EXTENDED",
-            ]
         return None
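To make the new semantics concrete, here is a self-contained sketch that mirrors the `_logger.py` changes above. The enum, mapping, and gating rule are re-declared locally so the snippet runs without onnxruntime installed; the `rank`/`world_size` parameters stand in for `get_rank()`/`get_world_size()`, and the ONNX Runtime `Severity` half of the level map is omitted.

```python
import logging
from enum import IntEnum


class LogLevel(IntEnum):
    # Mirrors the new ordering: DEVINFO sits between VERBOSE and INFO.
    VERBOSE = 0
    DEVINFO = 1  # For ORT developers.
    INFO = 2  # For ORT users.
    WARNING = 3
    ERROR = 4
    FATAL = 5


# Python-logging half of ORTMODULE_LOG_LEVEL_MAP; note that INFO keeps
# Python INFO while the ONNX Runtime backend is pushed to WARNING.
PY_LEVEL = {
    LogLevel.VERBOSE: logging.DEBUG,
    LogLevel.DEVINFO: logging.INFO,
    LogLevel.INFO: logging.INFO,
    LogLevel.WARNING: logging.WARNING,
    LogLevel.ERROR: logging.ERROR,
    LogLevel.FATAL: logging.FATAL,
}


def configure(log_level: LogLevel, rank: int, world_size: int) -> logging.Logger:
    """Same rules as the updated configure_ortmodule_logger."""
    rank_info = f".rank-{rank}" if world_size > 1 else ""
    logger = logging.getLogger(f"orttraining{rank_info}")
    # At INFO and above only rank 0 logs; DEVINFO/VERBOSE log on every rank.
    logger.disabled = log_level >= LogLevel.INFO and rank != 0
    logger.setLevel(PY_LEVEL[log_level])
    return logger


# Gating check on a simulated 2-process run: INFO is rank-0-only,
# DEVINFO logs everywhere.
assert configure(LogLevel.INFO, rank=1, world_size=2).disabled
assert not configure(LogLevel.DEVINFO, rank=1, world_size=2).disabled
```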