Merge pull request #505 from RandomDefaultUser/parallel_gpu_inference
Added parallel GPU inference
RandomDefaultUser authored Feb 22, 2024
2 parents 59f67ff + df407d7 commit dd02e78
Showing 5 changed files with 52 additions and 16 deletions.
8 changes: 1 addition & 7 deletions mala/descriptors/descriptor.py

@@ -527,13 +527,7 @@ def _setup_lammps(self, nx, ny, nz, outdir, lammps_dict,
         from lammps import lammps

         parallel_warn("Do not initialize more than one pre-processing calculation\
-                      in the same directory at the same time. Data may be over-written.")
-
-        if self.parameters._configuration["mpi"] and \
-                self.parameters._configuration["gpu"]:
-            raise Exception("LAMMPS can currently only work with multiple "
-                            "ranks or GPU on one rank - but not multiple GPUs "
-                            "across ranks.")
+                      in the same directory at the same time. Data may be over-written.")

         # Build LAMMPS arguments from the data we read.
         lmp_cmdargs = ["-screen", "none", "-log",
8 changes: 2 additions & 6 deletions mala/network/network.py

@@ -190,12 +190,8 @@ def load_from_file(cls, params, file):
             The network that was loaded from the file.
         """
         loaded_network = Network(params)
-        if params.use_gpu:
-            loaded_network.load_state_dict(torch.load(file,
-                                                      map_location="cuda"))
-        else:
-            loaded_network.load_state_dict(torch.load(file,
-                                                      map_location="cpu"))
+        loaded_network.\
+            load_state_dict(torch.load(file, map_location=params.device))
         loaded_network.eval()
         return loaded_network
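For context on the network.py change: torch.load's map_location argument remaps every stored tensor onto the requested device, which is what lets the use_gpu branching collapse into a single call. A minimal self-contained sketch of the pattern (SmallNet and net.pth are hypothetical stand-ins for MALA's Network and its checkpoint file):

    import torch

    class SmallNet(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(4, 1)

        def forward(self, x):
            return self.layer(x)

    device = "cuda" if torch.cuda.is_available() else "cpu"

    net = SmallNet()
    torch.save(net.state_dict(), "net.pth")

    # map_location remaps all stored tensors onto the requested device,
    # so one load path serves both the CPU and the GPU case.
    restored = SmallNet()
    restored.load_state_dict(torch.load("net.pth", map_location=device))
    restored.eval()  # inference mode, as in Network.load_from_file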
5 changes: 5 additions & 0 deletions mala/network/predictor.py

@@ -187,6 +187,11 @@ def predict_for_atoms(self, atoms, gather_ldos=False, temperature=None):
     def _forward_snap_descriptors(self, snap_descriptors,
                                   local_data_size=None):
         """Forward a scaled tensor of descriptors through the NN."""
+        # Ensure the Network is on the correct device.
+        # This line is necessary because GPU acceleration may have been
+        # activated AFTER loading a model.
+        self.network.to(self.network.params._configuration["device"])
+
         if local_data_size is None:
             local_data_size = self.data.grid_size
         predicted_outputs = \
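For context on the predictor.py change: the added .to(...) call covers the case where a model was restored on the CPU and GPU acceleration was switched on only afterwards. A small standalone sketch of that pattern (the Linear layer stands in for the loaded network):

    import torch

    model = torch.nn.Linear(4, 1)   # stands in for the loaded network
    inputs = torch.randn(8, 4)

    # Suppose GPU use is activated only after loading: parameters and
    # inputs would otherwise sit on different devices. .to() moves the
    # model over and is a no-op if it is already on the right device.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    outputs = model(inputs.to(device))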
32 changes: 31 additions & 1 deletion mala/network/runner.py

@@ -114,7 +114,8 @@ def save_run(self, run_name, save_path="./", zip_run=True,
     @classmethod
     def load_run(cls, run_name, path="./", zip_run=True,
                  params_format="json", load_runner=True,
-                 prepare_data=False):
+                 prepare_data=False, load_with_mpi=None,
+                 load_with_gpu=None):
         """
         Load a run.

@@ -141,6 +142,23 @@ def load_run(cls, run_name, path="./", zip_run=True,
             If True, the data will be loaded into memory. This is needed when
             continuing a model training.

+        load_with_mpi : bool
+            Can be used to actively enable/disable MPI during loading.
+            Default is None, so that the MPI parameters set during
+            training/saving of the model are not overwritten.
+            If MPI is to be used in concert with GPU during inference,
+            MPI already has to be activated here, if it was not activated
+            during training!
+
+        load_with_gpu : bool
+            Can be used to actively enable/disable GPU during loading.
+            Default is None, so that the GPU parameters set during
+            training/saving of the model are not overwritten.
+            If MPI is to be used in concert with GPU during inference,
+            it is advised that GPU usage is activated here, if it was not
+            activated during training. Can also be used to activate a
+            CPU-based inference, by setting it to False.
+
         Return
         ------
         loaded_params : mala.common.parameters.Parameters

@@ -183,6 +201,13 @@ def load_run(cls, run_name, path="./", zip_run=True,
                                              ".params."+params_format)

         loaded_params = Parameters.load_from_json(loaded_params)
+
+        # MPI has to be specified upon loading, in contrast to GPU.
+        if load_with_mpi is not None:
+            loaded_params.use_mpi = load_with_mpi
+        if load_with_gpu is not None:
+            loaded_params.use_gpu = load_with_gpu
+
         loaded_network = Network.load_from_file(loaded_params,
                                                 loaded_network)
         loaded_iscaler = DataScaler.load_from_file(loaded_iscaler)

@@ -283,6 +308,11 @@ def _forward_entire_snapshot(self, snapshot_number, data_set,
         predicted_outputs : numpy.ndarray
             Predicted outputs for snapshot.
         """
+        # Ensure the Network is on the correct device.
+        # This line is necessary because GPU acceleration may have been
+        # activated AFTER loading a model.
+        self.network.to(self.network.params._configuration["device"])
+
         # Determine where the snapshot begins and ends.
         from_index = 0
         to_index = None
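A hedged usage sketch for the new load_run flags ("be_model" is a hypothetical run name, and the four-element return tuple is assumed from MALA's basic examples; the diff itself only shows loaded_params):

    import mala

    # Both flags default to None, which keeps whatever MPI/GPU settings
    # were saved with the model; setting them overrides those settings.
    parameters, network, data_handler, predictor = mala.Predictor.load_run(
        "be_model", load_with_mpi=True, load_with_gpu=True)

    # load_with_gpu=False would instead force a CPU-based inference.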
15 changes: 13 additions & 2 deletions mala/targets/density.py

@@ -1046,10 +1046,21 @@ def __setup_total_energy_module(self, density_data, atoms_Angstrom,
         t0 = time.perf_counter()
         gaussian_descriptors = \
             np.reshape(gaussian_descriptors,
-                       [number_of_gridpoints, 1], order='F')
+                       [number_of_gridpoints_mala, 1], order='F')
         reference_gaussian_descriptors = \
             np.reshape(reference_gaussian_descriptors,
-                       [number_of_gridpoints, 1], order='F')
+                       [number_of_gridpoints_mala, 1], order='F')
+
+        # If there is an inconsistency between MALA and QE (which
+        # can only happen in the uneven z-splitting case at the moment)
+        # we need to pad the gaussian descriptor arrays.
+        if number_of_gridpoints_mala < number_of_gridpoints:
+            grid_diff = number_of_gridpoints - number_of_gridpoints_mala
+            gaussian_descriptors = np.pad(gaussian_descriptors,
+                                          pad_width=((0, grid_diff), (0, 0)))
+            reference_gaussian_descriptors = np.pad(reference_gaussian_descriptors,
+                                                    pad_width=((0, grid_diff), (0, 0)))
+
         sigma = self._parameters_full.descriptors.\
             atomic_density_sigma
         sigma = sigma / Bohr
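For context on the density.py padding: np.pad with pad_width=((0, grid_diff), (0, 0)) appends grid_diff zero-filled rows to the end of the first axis only, bringing the MALA array up to QE's grid point count. A standalone sketch with made-up grid sizes:

    import numpy as np

    number_of_gridpoints_mala = 6
    number_of_gridpoints = 8    # QE expects two more grid points here

    descriptors = np.arange(number_of_gridpoints_mala,
                            dtype=float).reshape(-1, 1)

    if number_of_gridpoints_mala < number_of_gridpoints:
        grid_diff = number_of_gridpoints - number_of_gridpoints_mala
        # Pad only the end of axis 0; np.pad fills with zeros by default.
        descriptors = np.pad(descriptors,
                             pad_width=((0, grid_diff), (0, 0)))

    print(descriptors.shape)  # (8, 1), with two trailing zero rows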
