Merge pull request #449 from franzpoeschel/openpmd-ci
CI for openPMD
RandomDefaultUser authored Jun 6, 2023

2 parents 974126a + 3736a88 commit 83f03be
Showing 16 changed files with 331 additions and 23 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/cpu-tests.yml
@@ -172,8 +172,8 @@ jobs:
with:
repository: mala-project/test-data
path: mala_data
- ref: v1.6.0
- lfs: false
+ ref: v1.7.0
+ lfs: true

- name: Test mala
shell: 'bash -c "docker exec -i mala-cpu bash < {0}"'
2 changes: 1 addition & 1 deletion Dockerfile
@@ -20,7 +20,7 @@ RUN conda env create -f mala_${DEVICE}_environment.yml && rm -rf /opt/conda/pkgs
RUN /opt/conda/envs/mala-${DEVICE}/bin/pip install --no-input --no-cache-dir \
pytest \
oapackage==2.6.8 \
- openpmd-api==0.14.5 \
+ openpmd-api==0.15.1 \
pqkmeans

RUN echo "source activate mala-${DEVICE}" > ~/.bashrc
10 changes: 9 additions & 1 deletion docs/source/usage/preprocessing.rst
@@ -32,7 +32,15 @@ MALA can be used to process raw data into ready-to-use data fro the surrogate mo
For this, the ``DataConverter`` class can be used; see example ``ex02_preprocess_data``.
If you are not sure which descriptor hyperparameters to use (e.g.: "Which cutoff
radius do I need?") MALA provides a fast analysis that does not involve
- model tuning. See ``ex13_acsd``
+ model tuning. See ``ex13_acsd``.
+
+ By default, MALA saves its data files to numpy ``.npy`` files. However, for
+ storing large amounts of volumetric data (plus metadata), libraries such as
+ `OpenPMD <https://github.com/openPMD/openPMD-api>`_ are more suitable.
+ MALA provides a full OpenPMD interface that is currently tested in production.
+ We recommend usage of the OpenPMD interface, which will become the new default
+ in upcoming versions. Examples related to data processing and general workflow
+ usage include lines that showcase the usage of OpenPMD within MALA.

Using input and output data
###########################
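
The documentation paragraph added above recommends the openPMD route; as a quick illustration, here is a minimal Python sketch of loading openPMD snapshots, assembled only from calls that appear elsewhere in this commit (data_path and the Be_snapshot*.h5 names are placeholders for your own converted data):

import mala

data_path = "/path/to/openpmd/snapshots"  # placeholder

parameters = mala.Parameters()
data_handler = mala.DataHandler(parameters)
# snapshot_type="openpmd" selects the openPMD backend instead of .npy loading.
data_handler.add_snapshot("Be_snapshot0.in.h5", data_path,
                          "Be_snapshot0.out.h5", data_path, "tr",
                          snapshot_type="openpmd")
data_handler.add_snapshot("Be_snapshot1.in.h5", data_path,
                          "Be_snapshot1.out.h5", data_path, "va",
                          snapshot_type="openpmd")
data_handler.prepare_data()
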
8 changes: 8 additions & 0 deletions examples/ex01_train_network.py
@@ -51,6 +51,14 @@
"Be_snapshot0.out.npy", data_path, "tr")
data_handler.add_snapshot("Be_snapshot1.in.npy", data_path,
"Be_snapshot1.out.npy", data_path, "va")
+ # New feature: You can switch the lines above for these to use the new,
+ # more powerful OpenPMD interface for MALA!
+ # data_handler.add_snapshot("Be_snapshot0.in.h5", data_path,
+ # "Be_snapshot0.out.h5", data_path, "tr",
+ # snapshot_type="openpmd")
+ # data_handler.add_snapshot("Be_snapshot1.in.h5", data_path,
+ # "Be_snapshot1.out.h5", data_path, "va",
+ # snapshot_type="openpmd")
data_handler.prepare_data()
printout("Read data: DONE.")

13 changes: 13 additions & 0 deletions examples/ex02_test_network.py
@@ -44,6 +44,19 @@
"Be_snapshot3.out.npy", data_path, "te",
calculation_output_file=
os.path.join(data_path, "Be_snapshot3.out"))
+ # New feature: You can switch the lines above for these to use the new,
+ # more powerful OpenPMD interface for MALA!
+ # data_handler.add_snapshot("Be_snapshot2.in.h5", data_path,
+ # "Be_snapshot2.out.h5", data_path, "te",
+ # calculation_output_file=
+ # os.path.join(data_path, "Be_snapshot2.out"),
+ # snapshot_type="openpmd")
+ # data_handler.add_snapshot("Be_snapshot3.in.h5", data_path,
+ # "Be_snapshot3.out.h5", data_path, "te",
+ # calculation_output_file=
+ # os.path.join(data_path, "Be_snapshot3.out"),
+ # snapshot_type="openpmd")

data_handler.prepare_data(reparametrize_scaler=False)


6 changes: 6 additions & 0 deletions examples/ex03_preprocess_data.py
@@ -74,6 +74,12 @@
target_save_path="./",
additional_info_save_path="./",
naming_scheme="Be_snapshot*.npy")
+ # New feature: You can switch the lines above for these to use the new,
+ # more powerful OpenPMD interface for MALA!
+ # data_converter.convert_snapshots(descriptor_save_path="./",
+ # target_save_path="./",
+ # additional_info_save_path="./",
+ # naming_scheme="Be_snapshot*.h5")

# If parts of the data have already been processed, the DataConverter class can
# also be used to convert the rest.
5 changes: 5 additions & 0 deletions examples/ex04_postprocess_data.py
@@ -50,6 +50,11 @@
ldos = mala.LDOS.from_numpy_file(test_parameters,
os.path.join(data_path,
"Be_snapshot0.out.npy"))
+ # New feature: You can switch the lines above for these to use the new,
+ # more powerful OpenPMD interface for MALA!
+ # ldos = mala.LDOS.from_numpy_file(test_parameters,
+ # os.path.join(data_path,
+ # "Be_snapshot0.out.h5"))

# Read additional information about the calculation.
# By doing this, the calculator is able to know e.g. the temperature
12 changes: 12 additions & 0 deletions examples/ex18_shuffle_data.py
@@ -36,11 +36,23 @@
"Be_snapshot0.out.npy", data_path)
data_shuffler.add_snapshot("Be_snapshot1.in.npy", data_path,
"Be_snapshot1.out.npy", data_path)
+ # New feature: You can switch the lines above for these to use the new,
+ # more powerful OpenPMD interface for MALA!
+ # data_shuffler.add_snapshot("Be_snapshot0.in.h5", data_path,
+ # "Be_snapshot0.out.h5", data_path,
+ # snapshot_type="openpmd")
+ # data_shuffler.add_snapshot("Be_snapshot1.in.h5", data_path,
+ # "Be_snapshot1.out.h5", data_path,
+ # snapshot_type="openpmd")

# After shuffling, these snapshots can be loaded as regular snapshots for
# lazily loaded training. Both OpenPMD and numpy can be used as save format
# for data.
data_shuffler.shuffle_snapshots(complete_save_path="./",
save_name="Be_shuffled*")
+ # New feature: You can switch the lines above for these to use the new,
+ # more powerful OpenPMD interface for MALA!
+ # data_shuffler.shuffle_snapshots(complete_save_path="./",
+ # save_name="Be_shuffled*.h5")
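
The comment above notes that shuffled snapshots can be fed straight back into lazily loaded training; a short Python sketch of that follow-up step, using only calls that also appear in the new shuffling tests in this commit (the Be_shuffled*.h5 files are the output of the shuffle_snapshots call above):

import mala

parameters = mala.Parameters()
parameters.data.use_lazy_loading = True  # as in the new openPMD training test
data_handler = mala.DataHandler(parameters)
# The shuffled .h5 files are read back through the openPMD backend.
data_handler.add_snapshot("Be_shuffled0.in.h5", ".",
                          "Be_shuffled0.out.h5", ".", "tr",
                          snapshot_type="openpmd")
data_handler.add_snapshot("Be_shuffled1.in.h5", ".",
                          "Be_shuffled1.out.h5", ".", "va",
                          snapshot_type="openpmd")
data_handler.prepare_data()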


17 changes: 12 additions & 5 deletions mala/datahandling/data_converter.py
@@ -502,11 +502,18 @@ def __convert_single_snapshot(self, snapshot_number,
self.descriptor_calculator.\
write_to_numpy_file(input_path, tmp_input)
else:
- tmp_input, local_offset, local_reach = \
- self.descriptor_calculator.convert_local_to_3d(tmp_input)
- self.descriptor_calculator.\
- write_to_openpmd_iteration(input_iteration,
- tmp_input, local_offset=local_offset, local_reach=local_reach)
+ if self.parameters._configuration["mpi"]:
+ tmp_input, local_offset, local_reach = \
+ self.descriptor_calculator.convert_local_to_3d(tmp_input)
+ self.descriptor_calculator. \
+ write_to_openpmd_iteration(input_iteration,
+ tmp_input,
+ local_offset=local_offset,
+ local_reach=local_reach)
+ else:
+ self.descriptor_calculator. \
+ write_to_openpmd_iteration(input_iteration,
+ tmp_input)
del tmp_input

###########
9 changes: 5 additions & 4 deletions mala/datahandling/data_shuffler.py
@@ -145,8 +145,10 @@ def __shuffle_numpy(self, number_of_new_snapshots, shuffle_dimensions,
new_targets)
else:
# We check above that in the non-numpy case, OpenPMD will work.
- self.descriptor_calculator.grid_dimensions = shuffle_dimensions
- self.target_calculator.grid_dimensions = shuffle_dimensions
+ self.descriptor_calculator.grid_dimensions = \
+ list(shuffle_dimensions)
+ self.target_calculator.grid_dimensions = \
+ list(shuffle_dimensions)
self.descriptor_calculator.\
write_to_openpmd_file(descriptor_name+".in."+file_ending,
new_descriptors,
@@ -265,7 +267,7 @@ def from_chunk_i(i, n, dset, slice_dimension=0):
# Do the actual shuffling.
for i in range(my_items_start, my_items_end):
# We check above that in the non-numpy case, OpenPMD will work.
- dot.calculator.grid_dimensions = shuffle_dimensions
+ dot.calculator.grid_dimensions = list(shuffle_dimensions)
name_prefix = os.path.join(dot.save_path,
save_name.replace("*", str(i)))
# do NOT open with MPI
@@ -326,7 +328,6 @@ def from_chunk_i(i, n, dset, slice_dimension=0):
for series in input_series_list:
series.close()


def shuffle_snapshots(self,
complete_save_path=None,
descriptor_save_path=None,
2 changes: 1 addition & 1 deletion mala/targets/density.py
@@ -396,7 +396,7 @@ def read_from_cube(self, path, units="1/A^3", **kwargs):
data, meta = read_cube(path)
data *= self.convert_units(1, in_units=units)
self.density = data
- self.grid_dimensions = np.shape(data)[0:3]
+ self.grid_dimensions = list(np.shape(data)[0:3])
return data

def read_from_xsf(self, path, units="1/A^3", **kwargs):
2 changes: 1 addition & 1 deletion mala/targets/ldos.py
@@ -1475,7 +1475,7 @@ def _read_from_qe_files(self, path_scheme, units,
# Convert and then append the LDOS data.
data = data*self.convert_units(1, in_units=units)
ldos_data[:, :, :, i-start_index] = data[:, :, :]
- self.grid_dimensions = np.shape(ldos_data)[0:3]
+ self.grid_dimensions = list(np.shape(ldos_data)[0:3])

# We have to gather the LDOS either file based or not.
if self.parameters._configuration["mpi"]:
1 change: 1 addition & 0 deletions requirements.txt
@@ -8,3 +8,4 @@ optuna
scipy
pandas
tensorboard
+ openpmd-api>=0.15
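
With the new openpmd-api requirement in place, a quick sanity check of the installed version (a sketch assuming the openpmd_api module exposes __version__, as the Python bindings do):

import openpmd_api

# requirements.txt asks for >= 0.15; the Dockerfile in this commit pins 0.15.1.
major, minor = (int(x) for x in openpmd_api.__version__.split(".")[:2])
assert (major, minor) >= (0, 15), openpmd_api.__version__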
2 changes: 1 addition & 1 deletion test/clean.sh
@@ -2,4 +2,4 @@

# Remove artifact files that some example scripts write.

- rm -rv *.pth *.pkl ex09.db *.pw* __pycache__ *.cube ex10_vis *.tmp *.npy *.json
+ rm -rv *.pth *.pkl ex09.db *.pw* __pycache__ *.cube ex10_vis *.tmp *.npy *.json *.h5 *.bp
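
The two new globs match the files the openPMD examples write: .h5 from the HDF5 backend and .bp from the ADIOS2 backend of openPMD-api.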
145 changes: 145 additions & 0 deletions test/shuffling_test.py
@@ -31,6 +31,7 @@ def test_seed(self):
# for lazily loaded training-
data_shuffler.shuffle_snapshots("./", save_name="Be_shuffled*")

test_parameters = mala.Parameters()
test_parameters.data.shuffling_seed = 1234
data_shuffler = mala.DataShuffler(test_parameters)

@@ -48,6 +49,53 @@ def test_seed(self):
new = np.load("Be_REshuffled1.out.npy")
assert np.isclose(np.sum(np.abs(old-new)), 0.0, atol=accuracy)

def test_seed_openpmd(self):
"""
Test that the shuffling is handled correctly internally.
This function tests the shuffling for OpenPMD and confirms that
shuffling both from numpy and openpmd into openpmd always gives the
same results. The first shuffling shuffles from openpmd to openpmd
format, the second from numpy to openpmd.
"""
test_parameters = mala.Parameters()
test_parameters.data.shuffling_seed = 1234
data_shuffler = mala.DataShuffler(test_parameters)

# Add a snapshot we want to use in to the list.
data_shuffler.add_snapshot("Be_snapshot0.in.h5", data_path,
"Be_snapshot0.out.h5", data_path,
snapshot_type="openpmd")
data_shuffler.add_snapshot("Be_snapshot1.in.h5", data_path,
"Be_snapshot1.out.h5", data_path,
snapshot_type="openpmd")

# After shuffling, these snapshots can be loaded as regular snapshots
# for lazily loaded training-
data_shuffler.shuffle_snapshots("./", save_name="Be_shuffled*.h5")

test_parameters = mala.Parameters()
test_parameters.data.shuffling_seed = 1234
data_shuffler = mala.DataShuffler(test_parameters)

# Add a snapshot we want to use in to the list.
data_shuffler.add_snapshot("Be_snapshot0.in.npy", data_path,
"Be_snapshot0.out.npy", data_path,
snapshot_type="numpy")
data_shuffler.add_snapshot("Be_snapshot1.in.npy", data_path,
"Be_snapshot1.out.npy", data_path,
snapshot_type="numpy")

# After shuffling, these snapshots can be loaded as regular snapshots
# for lazily loaded training-
data_shuffler.shuffle_snapshots("./", save_name="Be_REshuffled*.h5")

old = data_shuffler.target_calculator.\
read_from_openpmd_file("Be_shuffled1.out.h5")
new = data_shuffler.target_calculator.\
read_from_openpmd_file("Be_REshuffled1.out.h5")
assert np.isclose(np.sum(np.abs(old-new)), 0.0, atol=accuracy)

def test_training(self):
test_parameters = mala.Parameters()
test_parameters.data.data_splitting_type = "by_snapshot"
@@ -79,7 +127,18 @@ def test_training(self):
old_loss = test_trainer.final_validation_loss

# Shuffle.
test_parameters = mala.Parameters()
test_parameters.data.shuffling_seed = 1234
test_parameters.data.data_splitting_type = "by_snapshot"
test_parameters.data.input_rescaling_type = "feature-wise-standard"
test_parameters.data.output_rescaling_type = "normal"
test_parameters.network.layer_activations = ["ReLU"]
test_parameters.running.max_number_epochs = 50
test_parameters.running.mini_batch_size = 40
test_parameters.running.learning_rate = 0.00001
test_parameters.running.trainingtype = "Adam"
test_parameters.verbosity = 1
test_parameters.data.use_lazy_loading = True
data_shuffler = mala.DataShuffler(test_parameters)

# Add a snapshot we want to use in to the list.
@@ -102,6 +161,92 @@ def test_training(self):
"Be_shuffled1.out.npy", ".", "va")
data_handler.prepare_data()

test_parameters.network.layer_sizes = [data_handler.input_dimension,
100,
data_handler.output_dimension]

test_network = mala.Network(test_parameters)
test_trainer = mala.Trainer(test_parameters, test_network,
data_handler)
test_trainer.train_network()
new_loss = test_trainer.final_validation_loss
assert old_loss > new_loss

def test_training_openpmd(self):
test_parameters = mala.Parameters()
test_parameters.data.data_splitting_type = "by_snapshot"
test_parameters.data.input_rescaling_type = "feature-wise-standard"
test_parameters.data.output_rescaling_type = "normal"
test_parameters.network.layer_activations = ["ReLU"]
test_parameters.running.max_number_epochs = 50
test_parameters.running.mini_batch_size = 40
test_parameters.running.learning_rate = 0.00001
test_parameters.running.trainingtype = "Adam"
test_parameters.verbosity = 1
test_parameters.data.use_lazy_loading = True

# Train without shuffling.
data_handler = mala.DataHandler(test_parameters)
data_handler.add_snapshot("Be_snapshot0.in.h5", data_path,
"Be_snapshot0.out.h5", data_path, "tr",
snapshot_type="openpmd")
data_handler.add_snapshot("Be_snapshot1.in.h5", data_path,
"Be_snapshot1.out.h5", data_path, "va",
snapshot_type="openpmd")
data_handler.prepare_data()

test_parameters.network.layer_sizes = [data_handler.input_dimension,
100,
data_handler.output_dimension]
test_network = mala.Network(test_parameters)
test_trainer = mala.Trainer(test_parameters, test_network,
data_handler)
test_trainer.train_network()
old_loss = test_trainer.final_validation_loss

# Shuffle.
test_parameters = mala.Parameters()
test_parameters.data.shuffling_seed = 1234
test_parameters.data.data_splitting_type = "by_snapshot"
test_parameters.data.input_rescaling_type = "feature-wise-standard"
test_parameters.data.output_rescaling_type = "normal"
test_parameters.network.layer_activations = ["ReLU"]
test_parameters.running.max_number_epochs = 50
test_parameters.running.mini_batch_size = 40
test_parameters.running.learning_rate = 0.00001
test_parameters.running.trainingtype = "Adam"
test_parameters.verbosity = 1
test_parameters.data.use_lazy_loading = True

data_shuffler = mala.DataShuffler(test_parameters)

# Add a snapshot we want to use in to the list.
data_shuffler.add_snapshot("Be_snapshot0.in.h5", data_path,
"Be_snapshot0.out.h5", data_path,
snapshot_type="openpmd")
data_shuffler.add_snapshot("Be_snapshot1.in.h5", data_path,
"Be_snapshot1.out.h5", data_path,
snapshot_type="openpmd")

# After shuffling, these snapshots can be loaded as regular snapshots
# for lazily loaded training-
data_shuffler.shuffle_snapshots("./", save_name="Be_shuffled*.h5")
test_parameters.descriptors.descriptors_contain_xyz = True

# Train with shuffling.
data_handler = mala.DataHandler(test_parameters)
# Add a snapshot we want to use in to the list.
data_handler.add_snapshot("Be_shuffled0.in.h5", ".",
"Be_shuffled0.out.h5", ".", "tr",
snapshot_type="openpmd")
data_handler.add_snapshot("Be_shuffled1.in.h5", ".",
"Be_shuffled1.out.h5", ".", "va",
snapshot_type="openpmd")
data_handler.prepare_data()
test_parameters.network.layer_sizes = [data_handler.input_dimension,
100,
data_handler.output_dimension]

test_network = mala.Network(test_parameters)
test_trainer = mala.Trainer(test_parameters, test_network,
data_handler)