From 61ae5e9124db7a2592e20ad141a091ca1b195235 Mon Sep 17 00:00:00 2001
From: RandomDefaultUser
Date: Tue, 26 Nov 2024 10:45:09 +0000
Subject: [PATCH] deploy: db47311a98e5e6a37d0dd8134027d6b77efa16ca

---
 _modules/mala/common/parameters.html          | 48 +++++++--
 _sources/advanced_usage/trainingmodel.rst.txt | 61 ++++++++++--
 advanced_usage/trainingmodel.html             | 65 ++++++++++--
 api/mala.common.html                          |  5 +
 api/mala.common.parameters.html               | 93 ++++++++++++++++--
 api/mala.html                                 |  5 +
 api/modules.html                              |  5 +
 genindex.html                                 | 16 ++-
 objects.inv                                   | Bin 6340 -> 6378 bytes
 searchindex.js                                |  2 +-
 10 files changed, 259 insertions(+), 41 deletions(-)

diff --git a/_modules/mala/common/parameters.html b/_modules/mala/common/parameters.html
index 292a198ea..1677e13fe 100644
--- a/_modules/mala/common/parameters.html
+++ b/_modules/mala/common/parameters.html
@@ -328,7 +328,6 @@

Source code for mala.common.parameters

     ----------
     nn_type : string
         Type of the neural network that will be used. Currently supported are
-
             - "feed_forward" (default)
             - "transformer"
             - "lstm"
@@ -382,12 +381,12 @@ 

Source code for mala.common.parameters

         self.layer_activations = ["Sigmoid"]
         self.loss_function_type = "mse"
 
-        # for LSTM/Gru + Transformer
-        self.num_hidden_layers = 1
-
         # for LSTM/Gru
         self.no_hidden_state = False
         self.bidirection = False
+        
+        # for LSTM/Gru + Transformer
+        self.num_hidden_layers = 1
 
         # for transformer net
         self.dropout = 0.1
@@ -815,12 +814,15 @@ 

Source code for mala.common.parameters

         a "by snapshot" basis.
 
     checkpoints_each_epoch : int
-        If not 0, checkpoint files will be saved after eac
+        If not 0, checkpoint files will be saved after each
         checkpoints_each_epoch epoch.
 
     checkpoint_name : string
         Name used for the checkpoints. Using this, multiple runs
         can be performed in the same directory.
+        
+    run_name : string
+        Name of the run used for logging.
 
     logging_dir : string
         Name of the folder that logging files will be saved to.
@@ -829,6 +831,34 @@ 

Source code for mala.common.parameters

         If True, then upon creating logging files, these will be saved
         in a subfolder of logging_dir labelled with the starting date
         of the logging, to avoid having to change input scripts often.
+        
+    logger : string
+        Name of the logger to be used.
+        Currently supported are:
+        
+            - "tensorboard": Tensorboard logger.
+            - "wandb": Weights and Biases logger.
+    
+    validation_metrics : list
+        List of metrics to be used for validation. Default is ["ldos"].
+        Possible options are:
+        
+            - "ldos": MSE of the LDOS.
+            - "band_energy": Band energy.
+            - "band_energy_actual_fe": Band energy computed with ground truth Fermi energy.
+            - "total_energy": Total energy.
+            - "total_energy_actual_fe": Total energy computed with ground truth Fermi energy.
+            - "fermi_energy": Fermi energy.
+            - "density": Electron density.
+            - "density_relative": Electron density (MAPE).
+            - "dos": Density of states.
+            - "dos_relative": Density of states (MAPE).
+            
+    validate_on_training_data : bool
+        Whether to validate on the training data as well. Default is False.
+        
+    validate_every_n_epochs : int
+        Determines how often validation is performed. Default is 1.
 
     inference_data_grid : list
         List holding the grid to be used for inference in the form of
@@ -843,19 +873,18 @@ 

Source code for mala.common.parameters

 
     profiler_range : list
         List with two entries determining with which batch/iteration number
-         the CUDA profiler will start and stop profiling. Please note that
-         this option only holds significance if the nsys profiler is used.
+        the CUDA profiler will start and stop profiling. Please note that
+        this option only holds significance if the nsys profiler is used.
     """
 
     def __init__(self):
         super(ParametersRunning, self).__init__()
         self.optimizer = "Adam"
-        self.learning_rate = 10 ** (-5)
+        self.learning_rate = 0.5
         self.learning_rate_embedding = 10 ** (-4)
         self.max_number_epochs = 100
         self.verbosity = True
         self.mini_batch_size = 10
-        self.snapshots_per_epoch = -1
 
         self.l1_regularization = 0.0
         self.l2_regularization = 0.0
@@ -874,7 +903,6 @@ 

Source code for mala.common.parameters

         self.num_workers = 0
         self.use_shuffling_for_samplers = True
         self.checkpoints_each_epoch = 0
-        self.checkpoint_best_so_far = False
         self.checkpoint_name = "checkpoint_mala"
         self.run_name = ""
         self.logging_dir = "./mala_logging"
diff --git a/_sources/advanced_usage/trainingmodel.rst.txt b/_sources/advanced_usage/trainingmodel.rst.txt
index 290aa15f3..9b118d86b 100644
--- a/_sources/advanced_usage/trainingmodel.rst.txt
+++ b/_sources/advanced_usage/trainingmodel.rst.txt
@@ -194,22 +194,64 @@ keyword, you can fine-tune the number of new snapshots being created.
 By default, the same number of snapshots as had been provided will be created
 (if possible).
 
-Using tensorboard
-******************
+Logging metrics during training
+*******************************
+
+Training progress in MALA can be visualized via tensorboard or wandb, as shown
+in the file ``advanced/ex03_tensor_board``. Simply select a logger prior to training via
+
+      .. code-block:: python
+
+            parameters.running.logger = "tensorboard"
+            parameters.running.logging_dir = "mala_vis"
 
-Training routines in MALA can be visualized via tensorboard, as also shown
-in the file ``advanced/ex03_tensor_board``. Simply enable tensorboard
-visualization prior to training via
+or
 
       .. code-block:: python
 
-            # 0: No visualizatuon, 1: loss and learning rate, 2: like 1,
-            # but additionally weights and biases are saved
-            parameters.running.logging = 1
+            import wandb
+            wandb.init(
+                  project="mala_training",
+                  entity="your_wandb_entity"
+            )
+            parameters.running.logger = "wandb"
             parameters.running.logging_dir = "mala_vis"
 
 where ``logging_dir`` specifies some directory in which to save the
-MALA logging data. Afterwards, you can run the training without any
+MALA logging data. You can also select which metrics to record via
+
+      .. code-block:: python
+
+            parameters.running.validation_metrics = ["ldos", "dos", "density", "total_energy"]
+
+Full list of available metrics:
+      - "ldos": MSE of the LDOS.
+      - "band_energy": Band energy.
+      - "band_energy_actual_fe": Band energy computed with ground truth Fermi energy.
+      - "total_energy": Total energy.
+      - "total_energy_actual_fe": Total energy computed with ground truth Fermi energy.
+      - "fermi_energy": Fermi energy.
+      - "density": Electron density.
+      - "density_relative": Electron density (Mean Absolute Percentage Error).
+      - "dos": Density of states.
+      - "dos_relative": Density of states (Mean Absolute Percentage Error).
+
+To save time and resources, you can specify the validation interval via
+
+      .. code-block:: python
+
+            parameters.running.validate_every_n_epochs = 10
+
+If you want to monitor the degree to which the model overfits to the training data,
+you can use the option
+
+      .. code-block:: python
+            
+            parameters.running.validate_on_training_data = True
+
+MALA will evaluate the validation metrics on the training set as well as the validation set.
+
+Afterwards, you can run the training without any
 other modifications. Once training is finished (or during training, in case
 you want to use tensorboard to monitor progress), you can launch tensorboard
 via
@@ -221,6 +263,7 @@ via
 The full path for ``path_to_log_directory`` can be accessed via
 ``trainer.full_logging_path``.
 
+If you're using wandb, you can monitor the training progress on the wandb website.
 
 Training in parallel
 ********************
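
(For reference, the logging options introduced in this section can be combined
into a single setup. A condensed sketch follows; the wandb project and entity
names are placeholders, and the network/data/trainer setup is omitted.)

      .. code-block:: python

            import mala
            import wandb

            parameters = mala.Parameters()

            # pick a logging backend ("tensorboard" or "wandb")
            wandb.init(project="mala_training", entity="your_wandb_entity")
            parameters.running.logger = "wandb"
            parameters.running.logging_dir = "mala_vis"

            # choose which validation metrics to record, validate only every
            # 10 epochs, and also evaluate the metrics on the training data
            # to monitor overfitting
            parameters.running.validation_metrics = [
                  "ldos", "dos", "density", "total_energy"
            ]
            parameters.running.validate_every_n_epochs = 10
            parameters.running.validate_on_training_data = True
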
diff --git a/advanced_usage/trainingmodel.html b/advanced_usage/trainingmodel.html
index a48c1b976..1a3405c79 100644
--- a/advanced_usage/trainingmodel.html
+++ b/advanced_usage/trainingmodel.html
@@ -59,7 +59,7 @@
 
  • Advanced training metrics
  • Checkpointing a training run
  • Using lazy loading
  • -
  • Using tensorboard
  • +
  • Logging metrics during training
  • Training in parallel
@@ -280,21 +280,65 @@

    Using lazy loading -

    Using tensorboard

    -

    Training routines in MALA can be visualized via tensorboard, as also shown -in the file advanced/ex03_tensor_board. Simply enable tensorboard -visualization prior to training via

    +
    +

    Logging metrics during training

    +

    Training progress in MALA can be visualized via tensorboard or wandb, as shown +in the file advanced/ex03_tensor_board. Simply select a logger prior to training via

    -
    # 0: No visualizatuon, 1: loss and learning rate, 2: like 1,
    -# but additionally weights and biases are saved
    -parameters.running.logging = 1
    +
    parameters.running.logger = "tensorboard"
    +parameters.running.logging_dir = "mala_vis"
    +
    +
    +
    +

    or

    +
    +
    import wandb
    +wandb.init(
    +      project="mala_training",
    +      entity="your_wandb_entity"
    +)
    +parameters.running.logger = "wandb"
     parameters.running.logging_dir = "mala_vis"
     

    where logging_dir specifies some directory in which to save the -MALA logging data. Afterwards, you can run the training without any +MALA logging data. You can also select which metrics to record via

    +
    +
    parameters.running.validation_metrics = ["ldos", "dos", "density", "total_energy"]
    +
    +
    +
    +
    +
    Full list of available metrics:
      +
    • “ldos”: MSE of the LDOS.

    • +
    • “band_energy”: Band energy.

    • +
    • “band_energy_actual_fe”: Band energy computed with ground truth Fermi energy.

    • +
    • “total_energy”: Total energy.

    • +
    • “total_energy_actual_fe”: Total energy computed with ground truth Fermi energy.

    • +
    • “fermi_energy”: Fermi energy.

    • +
    • “density”: Electron density.

    • +
    • “density_relative”: Electron density (Mean Absolute Percentage Error).

    • +
    • “dos”: Density of states.

    • +
    • “dos_relative”: Density of states (Mean Absolute Percentage Error).

    • +
    +
    +
    +

    To save time and resources, you can specify the validation interval via

    +
    +
    parameters.running.validate_every_n_epochs = 10
    +
    +
    +
    +

    If you want to monitor the degree to which the model overfits to the training data, +you can use the option

    +
    +
    parameters.running.validate_on_training_data = True
    +
    +
    +
    +

    MALA will evaluate the validation metrics on the training set as well as the validation set.

    +

    Afterwards, you can run the training without any other modifications. Once training is finished (or during training, in case you want to use tensorboard to monitor progress), you can launch tensorboard via

@@ -305,6 +349,7 @@

    The full path for path_to_log_directory can be accessed via trainer.full_logging_path.

    +

    If you’re using wandb, you can monitor the training progress on the wandb website.

    Training in parallel

diff --git a/api/mala.common.html b/api/mala.common.html
index 1ba1a8179..5cb3d3ca1 100644
--- a/api/mala.common.html
+++ b/api/mala.common.html
@@ -245,8 +245,13 @@

    common<
  • ParametersRunning.use_shuffling_for_samplers
  • ParametersRunning.checkpoints_each_epoch
  • ParametersRunning.checkpoint_name
  • +
  • ParametersRunning.run_name
  • ParametersRunning.logging_dir
  • ParametersRunning.logging_dir_append_date
  • +
  • ParametersRunning.logger
  • +
  • ParametersRunning.validation_metrics
  • +
  • ParametersRunning.validate_on_training_data
  • +
  • ParametersRunning.validate_every_n_epochs
  • ParametersRunning.inference_data_grid
  • ParametersRunning.use_mixed_precision
  • ParametersRunning.training_log_interval
diff --git a/api/mala.common.parameters.html b/api/mala.common.parameters.html
index ef62caae2..a8563751d 100644
--- a/api/mala.common.parameters.html
+++ b/api/mala.common.parameters.html
@@ -1070,15 +1070,15 @@
    nn_type
    -

    Type of the neural network that will be used. Currently supported are

    -
    -
      +
      +
      Type of the neural network that will be used. Currently supported are
      • “feed_forward” (default)

      • “transformer”

      • “lstm”

      • “gru”

      -
    +
    +
    Type:

    string

@@ -1348,7 +1348,7 @@
    checkpoints_each_epoch
    -

    If not 0, checkpoint files will be saved after eac +

    If not 0, checkpoint files will be saved after each checkpoints_each_epoch epoch.

    Type:
@@ -1369,6 +1369,17 @@
    +
    +
    +run_name
    +

    Name of the run used for logging.

    +
    +
    Type:
    +

    string

    +
    +
    +
    +
    logging_dir
@@ -1393,6 +1404,72 @@
    +
    +
    +logger
    +

    Name of the logger to be used. +Currently supported are:

    +
    +
      +
    • “tensorboard”: Tensorboard logger.

    • +
    • “wandb”: Weights and Biases logger.

    • +
    +
    +
    +
    Type:
    +

    string

    +
    +
    +
    + +
    +
    +validation_metrics
    +

    List of metrics to be used for validation. Default is [“ldos”]. +Possible options are:

    +
    +
      +
    • “ldos”: MSE of the LDOS.

    • +
    • “band_energy”: Band energy.

    • +
    • “band_energy_actual_fe”: Band energy computed with ground truth Fermi energy.

    • +
    • “total_energy”: Total energy.

    • +
    • “total_energy_actual_fe”: Total energy computed with ground truth Fermi energy.

    • +
    • “fermi_energy”: Fermi energy.

    • +
    • “density”: Electron density.

    • +
    • “density_relative”: Electron density (MAPE).

    • +
    • “dos”: Density of states.

    • +
    • “dos_relative”: Density of states (MAPE).

    • +
    +
    +
    +
    Type:
    +

    list

    +
    +
    +
    + +
    +
    +validate_on_training_data
    +

    Whether to validate on the training data as well. Default is False.

    +
    +
    Type:
    +

    bool

    +
    +
    +
    + +
    +
    +validate_every_n_epochs
    +

    Determines how often validation is performed. Default is 1.

    +
    +
    Type:
    +

    int

    +
    +
    +
    +
    inference_data_grid
@@ -1431,11 +1508,9 @@
    profiler_range
    -
    -
    List with two entries determining with which batch/iteration number

    the CUDA profiler will start and stop profiling. Please note that +

    List with two entries determining with which batch/iteration number +the CUDA profiler will start and stop profiling. Please note that this option only holds significance if the nsys profiler is used.

    -
    -
    Type:

    list

diff --git a/api/mala.html b/api/mala.html
index 83a0e7dc4..253372855 100644
--- a/api/mala.html
+++ b/api/mala.html
@@ -241,8 +241,13 @@

mala
  • ParametersRunning.use_shuffling_for_samplers
  • ParametersRunning.checkpoints_each_epoch
  • ParametersRunning.checkpoint_name
  • +
  • ParametersRunning.run_name
  • ParametersRunning.logging_dir
  • ParametersRunning.logging_dir_append_date
  • +
  • ParametersRunning.logger
  • +
  • ParametersRunning.validation_metrics
  • +
  • ParametersRunning.validate_on_training_data
  • +
  • ParametersRunning.validate_every_n_epochs
  • ParametersRunning.inference_data_grid
  • ParametersRunning.use_mixed_precision
  • ParametersRunning.training_log_interval
diff --git a/api/modules.html b/api/modules.html
index b9120e8bf..f2c0b4d41 100644
--- a/api/modules.html
+++ b/api/modules.html
@@ -228,8 +228,13 @@

API reference
  • ParametersRunning.use_shuffling_for_samplers
  • ParametersRunning.checkpoints_each_epoch
  • ParametersRunning.checkpoint_name
  • +
  • ParametersRunning.run_name
  • ParametersRunning.logging_dir
  • ParametersRunning.logging_dir_append_date
  • +
  • ParametersRunning.logger
  • +
  • ParametersRunning.validation_metrics
  • +
  • ParametersRunning.validate_on_training_data
  • +
  • ParametersRunning.validate_every_n_epochs
  • ParametersRunning.inference_data_grid
  • ParametersRunning.use_mixed_precision
  • ParametersRunning.training_log_interval
diff --git a/genindex.html b/genindex.html
index 2188f0615..2442a8174 100644
--- a/genindex.html
+++ b/genindex.html
@@ -788,6 +788,8 @@

    L

  • local_psp_name (ParametersDataGeneration attribute)
  • local_psp_path (ParametersDataGeneration attribute) +
  • +
  • logger (ParametersRunning attribute)
  • logging_dir (ParametersRunning attribute)
@@ -1589,10 +1591,10 @@

    R

  • read_from_qe_dos_txt() (DOS method)
  • - - +