Merge pull request #449 from franzpoeschel/openpmd-ci
CI for openPMD
RandomDefaultUser authored Jun 6, 2023

2 parents 974126a + 3736a88 commit 83f03be
Showing 16 changed files with 331 additions and 23 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/cpu-tests.yml
@@ -172,8 +172,8 @@ jobs:
with:
repository: mala-project/test-data
path: mala_data
- ref: v1.6.0
- lfs: false
+ ref: v1.7.0
+ lfs: true

- name: Test mala
shell: 'bash -c "docker exec -i mala-cpu bash < {0}"'
2 changes: 1 addition & 1 deletion Dockerfile
@@ -20,7 +20,7 @@ RUN conda env create -f mala_${DEVICE}_environment.yml && rm -rf /opt/conda/pkgs
RUN /opt/conda/envs/mala-${DEVICE}/bin/pip install --no-input --no-cache-dir \
pytest \
oapackage==2.6.8 \
- openpmd-api==0.14.5 \
+ openpmd-api==0.15.1 \
pqkmeans

RUN echo "source activate mala-${DEVICE}" > ~/.bashrc
10 changes: 9 additions & 1 deletion docs/source/usage/preprocessing.rst
@@ -32,7 +32,15 @@ MALA can be used to process raw data into ready-to-use data fro the surrogate mo
For this, the ``DataConverter`` class can be used; see example ``ex02_preprocess_data``.
If you are not sure which descriptor hyperparameters to use (e.g.: "Which cutoff
radius do I need?") MALA provides a fast analysis that does not involve
- model tuning. See ``ex13_acsd``
+ model tuning. See ``ex13_acsd``.
+
+ By default, MALA saves its data files to numpy ``.npy`` files. However, for
+ storing large amounts of volumetric data (plus metadata), libraries such as
+ `OpenPMD <https://github.com/openPMD/openPMD-api>`_ are more suitable.
+ MALA provides a full OpenPMD interface that is currently tested in production.
+ We recommend usage of the OpenPMD interface, which will become the new default
+ in upcoming versions. Examples related to data processing and general workflow
+ usage include lines that showcase the usage of OpenPMD within MALA.

Using input and output data
###########################
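
The documentation paragraph added above recommends the openPMD route; as a quick illustration, here is a minimal Python sketch of loading openPMD snapshots, assembled only from calls that appear elsewhere in this commit (data_path and the Be_snapshot*.h5 names are placeholders for your own converted data):

import mala

data_path = "/path/to/openpmd/snapshots"  # placeholder

parameters = mala.Parameters()
data_handler = mala.DataHandler(parameters)
# snapshot_type="openpmd" selects the openPMD backend instead of .npy loading.
data_handler.add_snapshot("Be_snapshot0.in.h5", data_path,
                          "Be_snapshot0.out.h5", data_path, "tr",
                          snapshot_type="openpmd")
data_handler.add_snapshot("Be_snapshot1.in.h5", data_path,
                          "Be_snapshot1.out.h5", data_path, "va",
                          snapshot_type="openpmd")
data_handler.prepare_data()
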
8 changes: 8 additions & 0 deletions examples/ex01_train_network.py
@@ -51,6 +51,14 @@
"Be_snapshot0.out.npy", data_path, "tr")
data_handler.add_snapshot("Be_snapshot1.in.npy", data_path,
"Be_snapshot1.out.npy", data_path, "va")
+ # New feature: You can switch the lines above for these to use the new,
+ # more powerful OpenPMD interface for MALA!
+ # data_handler.add_snapshot("Be_snapshot0.in.h5", data_path,
+ # "Be_snapshot0.out.h5", data_path, "tr",
+ # snapshot_type="openpmd")
+ # data_handler.add_snapshot("Be_snapshot1.in.h5", data_path,
+ # "Be_snapshot1.out.h5", data_path, "va",
+ # snapshot_type="openpmd")
data_handler.prepare_data()
printout("Read data: DONE.")

13 changes: 13 additions & 0 deletions examples/ex02_test_network.py
@@ -44,6 +44,19 @@
"Be_snapshot3.out.npy", data_path, "te",
calculation_output_file=
os.path.join(data_path, "Be_snapshot3.out"))
+ # New feature: You can switch the lines above for these to use the new,
+ # more powerful OpenPMD interface for MALA!
+ # data_handler.add_snapshot("Be_snapshot2.in.h5", data_path,
+ # "Be_snapshot2.out.h5", data_path, "te",
+ # calculation_output_file=
+ # os.path.join(data_path, "Be_snapshot2.out"),
+ # snapshot_type="openpmd")
+ # data_handler.add_snapshot("Be_snapshot3.in.h5", data_path,
+ # "Be_snapshot3.out.h5", data_path, "te",
+ # calculation_output_file=
+ # os.path.join(data_path, "Be_snapshot3.out"),
+ # snapshot_type="openpmd")

data_handler.prepare_data(reparametrize_scaler=False)


6 changes: 6 additions & 0 deletions examples/ex03_preprocess_data.py
@@ -74,6 +74,12 @@
target_save_path="./",
additional_info_save_path="./",
naming_scheme="Be_snapshot*.npy")
+ # New feature: You can switch the lines above for these to use the new,
+ # more powerful OpenPMD interface for MALA!
+ # data_converter.convert_snapshots(descriptor_save_path="./",
+ # target_save_path="./",
+ # additional_info_save_path="./",
+ # naming_scheme="Be_snapshot*.h5")

# If parts of the data have already been processed, the DataConverter class can
# also be used to convert the rest.
5 changes: 5 additions & 0 deletions examples/ex04_postprocess_data.py
@@ -50,6 +50,11 @@
ldos = mala.LDOS.from_numpy_file(test_parameters,
os.path.join(data_path,
"Be_snapshot0.out.npy"))
+ # New feature: You can switch the lines above for these to use the new,
+ # more powerful OpenPMD interface for MALA!
+ # ldos = mala.LDOS.from_numpy_file(test_parameters,
+ # os.path.join(data_path,
+ # "Be_snapshot0.out.h5"))

# Read additional information about the calculation.
# By doing this, the calculator is able to know e.g. the temperature
12 changes: 12 additions & 0 deletions examples/ex18_shuffle_data.py
@@ -36,11 +36,23 @@
"Be_snapshot0.out.npy", data_path)
data_shuffler.add_snapshot("Be_snapshot1.in.npy", data_path,
"Be_snapshot1.out.npy", data_path)
+ # New feature: You can switch the lines above for these to use the new,
+ # more powerful OpenPMD interface for MALA!
+ # data_shuffler.add_snapshot("Be_snapshot0.in.h5", data_path,
+ # "Be_snapshot0.out.h5", data_path,
+ # snapshot_type="openpmd")
+ # data_shuffler.add_snapshot("Be_snapshot1.in.h5", data_path,
+ # "Be_snapshot1.out.h5", data_path,
+ # snapshot_type="openpmd")

# After shuffling, these snapshots can be loaded as regular snapshots for
# lazily loaded training. Both OpenPMD and numpy can be used as save format
# for data.
data_shuffler.shuffle_snapshots(complete_save_path="./",
save_name="Be_shuffled*")
+ # New feature: You can switch the lines above for these to use the new,
+ # more powerful OpenPMD interface for MALA!
+ # data_shuffler.shuffle_snapshots(complete_save_path="./",
+ # save_name="Be_shuffled*.h5")
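
The comment above notes that shuffled snapshots can be fed straight back into lazily loaded training; a short Python sketch of that follow-up step, using only calls that also appear in the new shuffling tests in this commit (the Be_shuffled*.h5 files are the output of the shuffle_snapshots call above):

import mala

parameters = mala.Parameters()
parameters.data.use_lazy_loading = True  # as in the new openPMD training test
data_handler = mala.DataHandler(parameters)
# The shuffled .h5 files are read back through the openPMD backend.
data_handler.add_snapshot("Be_shuffled0.in.h5", ".",
                          "Be_shuffled0.out.h5", ".", "tr",
                          snapshot_type="openpmd")
data_handler.add_snapshot("Be_shuffled1.in.h5", ".",
                          "Be_shuffled1.out.h5", ".", "va",
                          snapshot_type="openpmd")
data_handler.prepare_data()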


17 changes: 12 additions & 5 deletions mala/datahandling/data_converter.py
@@ -502,11 +502,18 @@ def __convert_single_snapshot(self, snapshot_number,
self.descriptor_calculator.\
write_to_numpy_file(input_path, tmp_input)
else:
- tmp_input, local_offset, local_reach = \
- self.descriptor_calculator.convert_local_to_3d(tmp_input)
- self.descriptor_calculator.\
- write_to_openpmd_iteration(input_iteration,
- tmp_input, local_offset=local_offset, local_reach=local_reach)
+ if self.parameters._configuration["mpi"]:
+ tmp_input, local_offset, local_reach = \
+ self.descriptor_calculator.convert_local_to_3d(tmp_input)
+ self.descriptor_calculator. \
+ write_to_openpmd_iteration(input_iteration,
+ tmp_input,
+ local_offset=local_offset,
+ local_reach=local_reach)
+ else:
+ self.descriptor_calculator. \
+ write_to_openpmd_iteration(input_iteration,
+ tmp_input)
del tmp_input

###########
9 changes: 5 additions & 4 deletions mala/datahandling/data_shuffler.py
@@ -145,8 +145,10 @@ def __shuffle_numpy(self, number_of_new_snapshots, shuffle_dimensions,
new_targets)
else:
# We check above that in the non-numpy case, OpenPMD will work.
- self.descriptor_calculator.grid_dimensions = shuffle_dimensions
- self.target_calculator.grid_dimensions = shuffle_dimensions
+ self.descriptor_calculator.grid_dimensions = \
+ list(shuffle_dimensions)
+ self.target_calculator.grid_dimensions = \
+ list(shuffle_dimensions)
self.descriptor_calculator.\
write_to_openpmd_file(descriptor_name+".in."+file_ending,
new_descriptors,
@@ -265,7 +267,7 @@ def from_chunk_i(i, n, dset, slice_dimension=0):
# Do the actual shuffling.
for i in range(my_items_start, my_items_end):
# We check above that in the non-numpy case, OpenPMD will work.
- dot.calculator.grid_dimensions = shuffle_dimensions
+ dot.calculator.grid_dimensions = list(shuffle_dimensions)
name_prefix = os.path.join(dot.save_path,
save_name.replace("*", str(i)))
# do NOT open with MPI
@@ -326,7 +328,6 @@ def from_chunk_i(i, n, dset, slice_dimension=0):
for series in input_series_list:
series.close()


def shuffle_snapshots(self,
complete_save_path=None,
descriptor_save_path=None,
2 changes: 1 addition & 1 deletion mala/targets/density.py
@@ -396,7 +396,7 @@ def read_from_cube(self, path, units="1/A^3", **kwargs):
data, meta = read_cube(path)
data *= self.convert_units(1, in_units=units)
self.density = data
- self.grid_dimensions = np.shape(data)[0:3]
+ self.grid_dimensions = list(np.shape(data)[0:3])
return data

def read_from_xsf(self, path, units="1/A^3", **kwargs):
2 changes: 1 addition & 1 deletion mala/targets/ldos.py
@@ -1475,7 +1475,7 @@ def _read_from_qe_files(self, path_scheme, units,
# Convert and then append the LDOS data.
data = data*self.convert_units(1, in_units=units)
ldos_data[:, :, :, i-start_index] = data[:, :, :]
- self.grid_dimensions = np.shape(ldos_data)[0:3]
+ self.grid_dimensions = list(np.shape(ldos_data)[0:3])

# We have to gather the LDOS either file based or not.
if self.parameters._configuration["mpi"]:
1 change: 1 addition & 0 deletions requirements.txt
@@ -8,3 +8,4 @@ optuna
scipy
pandas
tensorboard
+ openpmd-api>=0.15
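
With the new openpmd-api requirement in place, a quick sanity check of the installed version (a sketch assuming the openpmd_api module exposes __version__, as the Python bindings do):

import openpmd_api

# requirements.txt asks for >= 0.15; the Dockerfile in this commit pins 0.15.1.
major, minor = (int(x) for x in openpmd_api.__version__.split(".")[:2])
assert (major, minor) >= (0, 15), openpmd_api.__version__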
2 changes: 1 addition & 1 deletion test/clean.sh
@@ -2,4 +2,4 @@

# Remove artifact files that some example scripts write.

- rm -rv *.pth *.pkl ex09.db *.pw* __pycache__ *.cube ex10_vis *.tmp *.npy *.json
+ rm -rv *.pth *.pkl ex09.db *.pw* __pycache__ *.cube ex10_vis *.tmp *.npy *.json *.h5 *.bp
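
The two new globs match the files the openPMD examples write: .h5 from the HDF5 backend and .bp from the ADIOS2 backend of openPMD-api.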
145 changes: 145 additions & 0 deletions test/shuffling_test.py
@@ -31,6 +31,7 @@ def test_seed(self):
# for lazily loaded training-
data_shuffler.shuffle_snapshots("./", save_name="Be_shuffled*")

test_parameters = mala.Parameters()
test_parameters.data.shuffling_seed = 1234
data_shuffler = mala.DataShuffler(test_parameters)

@@ -48,6 +49,53 @@ def test_seed(self):
new = np.load("Be_REshuffled1.out.npy")
assert np.isclose(np.sum(np.abs(old-new)), 0.0, atol=accuracy)

def test_seed_openpmd(self):
"""
Test that the shuffling is handled correctly internally.
This function tests the shuffling for OpenPMD and confirms that
shuffling both from numpy and openpmd into openpmd always gives the
same results. The first shuffling shuffles from openpmd to openpmd
format, the second from numpy to openpmd.
"""
test_parameters = mala.Parameters()
test_parameters.data.shuffling_seed = 1234
data_shuffler = mala.DataShuffler(test_parameters)

# Add a snapshot we want to use in to the list.
data_shuffler.add_snapshot("Be_snapshot0.in.h5", data_path,
"Be_snapshot0.out.h5", data_path,
snapshot_type="openpmd")
data_shuffler.add_snapshot("Be_snapshot1.in.h5", data_path,
"Be_snapshot1.out.h5", data_path,
snapshot_type="openpmd")

# After shuffling, these snapshots can be loaded as regular snapshots
# for lazily loaded training-
data_shuffler.shuffle_snapshots("./", save_name="Be_shuffled*.h5")

test_parameters = mala.Parameters()
test_parameters.data.shuffling_seed = 1234
data_shuffler = mala.DataShuffler(test_parameters)

# Add a snapshot we want to use in to the list.
data_shuffler.add_snapshot("Be_snapshot0.in.npy", data_path,
"Be_snapshot0.out.npy", data_path,
snapshot_type="numpy")
data_shuffler.add_snapshot("Be_snapshot1.in.npy", data_path,
"Be_snapshot1.out.npy", data_path,
snapshot_type="numpy")

# After shuffling, these snapshots can be loaded as regular snapshots
# for lazily loaded training-
data_shuffler.shuffle_snapshots("./", save_name="Be_REshuffled*.h5")

old = data_shuffler.target_calculator.\
read_from_openpmd_file("Be_shuffled1.out.h5")
new = data_shuffler.target_calculator.\
read_from_openpmd_file("Be_REshuffled1.out.h5")
assert np.isclose(np.sum(np.abs(old-new)), 0.0, atol=accuracy)

def test_training(self):
test_parameters = mala.Parameters()
test_parameters.data.data_splitting_type = "by_snapshot"
@@ -79,7 +127,18 @@ def test_training(self):
old_loss = test_trainer.final_validation_loss

# Shuffle.
test_parameters = mala.Parameters()
test_parameters.data.shuffling_seed = 1234
test_parameters.data.data_splitting_type = "by_snapshot"
test_parameters.data.input_rescaling_type = "feature-wise-standard"
test_parameters.data.output_rescaling_type = "normal"
test_parameters.network.layer_activations = ["ReLU"]
test_parameters.running.max_number_epochs = 50
test_parameters.running.mini_batch_size = 40
test_parameters.running.learning_rate = 0.00001
test_parameters.running.trainingtype = "Adam"
test_parameters.verbosity = 1
test_parameters.data.use_lazy_loading = True
data_shuffler = mala.DataShuffler(test_parameters)

# Add a snapshot we want to use in to the list.
@@ -102,6 +161,92 @@ def test_training(self):
"Be_shuffled1.out.npy", ".", "va")
data_handler.prepare_data()

test_parameters.network.layer_sizes = [data_handler.input_dimension,
100,
data_handler.output_dimension]

test_network = mala.Network(test_parameters)
test_trainer = mala.Trainer(test_parameters, test_network,
data_handler)
test_trainer.train_network()
new_loss = test_trainer.final_validation_loss
assert old_loss > new_loss

def test_training_openpmd(self):
test_parameters = mala.Parameters()
test_parameters.data.data_splitting_type = "by_snapshot"
test_parameters.data.input_rescaling_type = "feature-wise-standard"
test_parameters.data.output_rescaling_type = "normal"
test_parameters.network.layer_activations = ["ReLU"]
test_parameters.running.max_number_epochs = 50
test_parameters.running.mini_batch_size = 40
test_parameters.running.learning_rate = 0.00001
test_parameters.running.trainingtype = "Adam"
test_parameters.verbosity = 1
test_parameters.data.use_lazy_loading = True

# Train without shuffling.
data_handler = mala.DataHandler(test_parameters)
data_handler.add_snapshot("Be_snapshot0.in.h5", data_path,
"Be_snapshot0.out.h5", data_path, "tr",
snapshot_type="openpmd")
data_handler.add_snapshot("Be_snapshot1.in.h5", data_path,
"Be_snapshot1.out.h5", data_path, "va",
snapshot_type="openpmd")
data_handler.prepare_data()

test_parameters.network.layer_sizes = [data_handler.input_dimension,
100,
data_handler.output_dimension]
test_network = mala.Network(test_parameters)
test_trainer = mala.Trainer(test_parameters, test_network,
data_handler)
test_trainer.train_network()
old_loss = test_trainer.final_validation_loss

# Shuffle.
test_parameters = mala.Parameters()
test_parameters.data.shuffling_seed = 1234
test_parameters.data.data_splitting_type = "by_snapshot"
test_parameters.data.input_rescaling_type = "feature-wise-standard"
test_parameters.data.output_rescaling_type = "normal"
test_parameters.network.layer_activations = ["ReLU"]
test_parameters.running.max_number_epochs = 50
test_parameters.running.mini_batch_size = 40
test_parameters.running.learning_rate = 0.00001
test_parameters.running.trainingtype = "Adam"
test_parameters.verbosity = 1
test_parameters.data.use_lazy_loading = True

data_shuffler = mala.DataShuffler(test_parameters)

# Add a snapshot we want to use in to the list.
data_shuffler.add_snapshot("Be_snapshot0.in.h5", data_path,
"Be_snapshot0.out.h5", data_path,
snapshot_type="openpmd")
data_shuffler.add_snapshot("Be_snapshot1.in.h5", data_path,
"Be_snapshot1.out.h5", data_path,
snapshot_type="openpmd")

# After shuffling, these snapshots can be loaded as regular snapshots
# for lazily loaded training-
data_shuffler.shuffle_snapshots("./", save_name="Be_shuffled*.h5")
test_parameters.descriptors.descriptors_contain_xyz = True

# Train with shuffling.
data_handler = mala.DataHandler(test_parameters)
# Add a snapshot we want to use in to the list.
data_handler.add_snapshot("Be_shuffled0.in.h5", ".",
"Be_shuffled0.out.h5", ".", "tr",
snapshot_type="openpmd")
data_handler.add_snapshot("Be_shuffled1.in.h5", ".",
"Be_shuffled1.out.h5", ".", "va",
snapshot_type="openpmd")
data_handler.prepare_data()
test_parameters.network.layer_sizes = [data_handler.input_dimension,
100,
data_handler.output_dimension]

test_network = mala.Network(test_parameters)
test_trainer = mala.Trainer(test_parameters, test_network,
data_handler)