Skip to content

Commit

Permalink
Merge pull request #607 from RandomDefaultUser/fix_shuffling_divisors
Browse files Browse the repository at this point in the history
Fixing tiny information loss in shuffling
  • Loading branch information
RandomDefaultUser authored Nov 19, 2024
2 parents e5ef826 + fc6e2ec commit a61a489
Show file tree
Hide file tree
Showing 2 changed files with 166 additions and 169 deletions.
179 changes: 74 additions & 105 deletions mala/datahandling/data_shuffler.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def __init__(
self.descriptor_calculator.parameters.descriptors_contain_xyz = (
False
)
self.data_points_to_remove = None

def add_snapshot(
self,
Expand Down Expand Up @@ -136,7 +137,11 @@ def __shuffle_numpy(
if self.data_points_to_remove is not None:
if self.parameters.shuffling_seed is not None:
np.random.seed(idx * self.parameters.shuffling_seed)
ngrid = descriptor_data[idx].shape[0]
ngrid = (
descriptor_data[idx].shape[0]
* descriptor_data[idx].shape[1]
* descriptor_data[idx].shape[2]
)
n_descriptor = descriptor_data[idx].shape[-1]
n_target = target_data[idx].shape[-1]

Expand All @@ -146,8 +151,8 @@ def __shuffle_numpy(
)

indices = np.random.choice(
ngrid**3,
size=ngrid**3 - self.data_points_to_remove[idx],
ngrid,
size=ngrid - self.data_points_to_remove[idx],
)

descriptor_data[idx] = current_descriptor[indices]
Expand Down Expand Up @@ -532,117 +537,81 @@ def shuffle_snapshots(
snapshot_type = snapshot_types.pop()
del snapshot_types

snapshot_size_list = [
snapshot.grid_size
for snapshot in self.parameters.snapshot_directories_list
]
# Set the defaults, these may be changed below as needed.
snapshot_size_list = np.array(
[
snapshot.grid_size
for snapshot in self.parameters.snapshot_directories_list
]
)
number_of_data_points = np.sum(snapshot_size_list)

self.data_points_to_remove = None

if number_of_shuffled_snapshots is None:
# If the user does not tell us how many snapshots to use,
# we have to check if the number of snapshots is straightforward.
# If all snapshots have the same size, we can just replicate the
# snapshot structure.
if np.max(snapshot_size_list) == np.min(snapshot_size_list):
shuffle_dimensions = self.parameters.snapshot_directories_list[
0
].grid_dimension
number_of_new_snapshots = self.nr_snapshots
else:
# If the snapshots have different sizes we simply create
# (x, 1, 1) snapshots big enough to hold the data.
number_of_new_snapshots = self.nr_snapshots
while number_of_data_points % number_of_new_snapshots != 0:
number_of_new_snapshots += 1
# If they do have different sizes, we start with the smallest
# snapshot, there is some padding down below anyhow.
shuffle_dimensions = [
int(number_of_data_points / number_of_new_snapshots),
1,
1,
]
number_of_shuffled_snapshots = self.nr_snapshots
number_of_new_snapshots = number_of_shuffled_snapshots

if snapshot_type == "openpmd":
import math
import functools

if snapshot_type == "openpmd":
import math
import functools

number_of_new_snapshots = functools.reduce(
math.gcd,
[
snapshot.grid_dimension[0]
for snapshot in self.parameters.snapshot_directories_list
],
number_of_new_snapshots,
specified_number_of_new_snapshots = number_of_new_snapshots
number_of_new_snapshots = functools.reduce(
math.gcd,
[
snapshot.grid_dimension[0]
for snapshot in self.parameters.snapshot_directories_list
],
number_of_new_snapshots,
)
if number_of_new_snapshots != specified_number_of_new_snapshots:
print(
f"[openPMD shuffling] Reduced the number of output snapshots to "
f"{number_of_new_snapshots} because of the dataset dimensions."
)
else:
number_of_new_snapshots = number_of_shuffled_snapshots

if snapshot_type == "openpmd":
import math
import functools

specified_number_of_new_snapshots = number_of_new_snapshots
number_of_new_snapshots = functools.reduce(
math.gcd,
[
snapshot.grid_dimension[0]
for snapshot in self.parameters.snapshot_directories_list
],
number_of_new_snapshots,
del specified_number_of_new_snapshots
elif snapshot_type == "numpy":
# Implement all of the below for OpenPMD later.
# We need to check if we need to reduce the overall grid size
# because the individual snapshots may not contain enough data
# points
shuffled_gridsizes = snapshot_size_list // number_of_new_snapshots

if np.any(
np.array(snapshot_size_list)
- (
(np.array(snapshot_size_list) // number_of_new_snapshots)
* number_of_new_snapshots
)
> 0
):
number_of_data_points = int(
np.sum(shuffled_gridsizes) * number_of_new_snapshots
)
if (
number_of_new_snapshots
!= specified_number_of_new_snapshots
):
print(
f"[openPMD shuffling] Reduced the number of output snapshots to "
f"{number_of_new_snapshots} because of the dataset dimensions."
)
del specified_number_of_new_snapshots

if number_of_data_points % number_of_new_snapshots != 0:
if snapshot_type == "numpy":
self.data_points_to_remove = []
for i in range(0, self.nr_snapshots):
gridsize = self.parameters.snapshot_directories_list[
i
].grid_size
shuffled_gridsize = int(
gridsize / number_of_new_snapshots
)
self.data_points_to_remove.append(
gridsize
- shuffled_gridsize * number_of_new_snapshots
)
tot_points_missing = sum(self.data_points_to_remove)

printout(
"Warning: number of requested snapshots is not a divisor of",
"the original grid sizes.\n",
f"{tot_points_missing} / {number_of_data_points} data points",
"will be left out of the shuffled snapshots."
)
self.data_points_to_remove = []
for i in range(0, self.nr_snapshots):
self.data_points_to_remove.append(
snapshot_size_list[i]
- shuffled_gridsizes[i] * number_of_new_snapshots
)
tot_points_missing = sum(self.data_points_to_remove)

if tot_points_missing > 0:
printout(
"Warning: number of requested snapshots is not a divisor of",
"the original grid sizes.\n",
f"{tot_points_missing} / {number_of_data_points} data points",
"will be left out of the shuffled snapshots.",
)

shuffle_dimensions = [
int(number_of_data_points / number_of_new_snapshots),
1,
1,
]
else:
raise Exception("Invalid snapshot type.")

elif snapshot_type == "openpmd":
# TODO implement arbitrary grid sizes for openpmd
raise Exception(
"Cannot create this number of snapshots "
"from data provided."
)
else:
shuffle_dimensions = [
int(number_of_data_points / number_of_new_snapshots),
1,
1,
]
shuffle_dimensions = [
int(number_of_data_points / number_of_new_snapshots),
1,
1,
]

printout(
"Data shuffler will generate",
Expand Down
156 changes: 92 additions & 64 deletions test/shuffling_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,70 +50,70 @@ def test_seed(self):
new = np.load("Be_REshuffled1.out.npy")
assert np.isclose(np.sum(np.abs(old - new)), 0.0, atol=accuracy)

def test_seed_openpmd(self):
    """
    Check that seeded shuffling into OpenPMD is reproducible.

    The same two Be snapshots are shuffled twice with an identical seed,
    both times writing OpenPMD output: the first run reads the snapshots
    from OpenPMD files, the second run reads them from numpy files. The
    shuffled target data of both runs has to agree.
    """
    # One entry per shuffling run:
    # (snapshot type, (input file, output file) pairs, save name).
    shuffling_runs = (
        (
            "openpmd",
            (
                ("Be_snapshot0.in.h5", "Be_snapshot0.out.h5"),
                ("Be_snapshot1.in.h5", "Be_snapshot1.out.h5"),
            ),
            "Be_shuffled*.h5",
        ),
        (
            "numpy",
            (
                ("Be_snapshot0.in.npy", "Be_snapshot0.out.npy"),
                ("Be_snapshot1.in.npy", "Be_snapshot1.out.npy"),
            ),
            "Be_REshuffled*.h5",
        ),
    )

    shuffler = None
    for source_type, file_pairs, shuffled_name in shuffling_runs:
        # Every run gets fresh parameters and a fresh shuffler, seeded
        # with the same value.
        run_parameters = mala.Parameters()
        run_parameters.data.shuffling_seed = 1234
        shuffler = mala.DataShuffler(run_parameters)

        # Register the snapshots that are to be shuffled.
        for input_file, output_file in file_pairs:
            shuffler.add_snapshot(
                input_file,
                data_path,
                output_file,
                data_path,
                snapshot_type=source_type,
            )

        # After shuffling, these snapshots can be loaded as regular
        # snapshots for lazily loaded training.
        shuffler.shuffle_snapshots("./", save_name=shuffled_name)

    # The target calculator of the most recent shuffler reads both results.
    reader = shuffler.target_calculator
    reference = reader.read_from_openpmd_file("Be_shuffled1.out.h5")
    reshuffled = reader.read_from_openpmd_file("Be_REshuffled1.out.h5")
    total_deviation = np.sum(np.abs(reference - reshuffled))
    assert np.isclose(total_deviation, 0.0, atol=accuracy)
# def test_seed_openpmd(self):
# """
# Test that the shuffling is handled correctly internally.
#
# This function tests the shuffling for OpenPMD and confirms that
# shuffling both from numpy and openpmd into openpmd always gives the
# same results. The first shuffling shuffles from openpmd to openpmd
# format, the second from numpy to openpmd.
# """
# test_parameters = mala.Parameters()
# test_parameters.data.shuffling_seed = 1234
# data_shuffler = mala.DataShuffler(test_parameters)
#
#     # Add the snapshots we want to use to the list.
# data_shuffler.add_snapshot(
# "Be_snapshot0.in.h5",
# data_path,
# "Be_snapshot0.out.h5",
# data_path,
# snapshot_type="openpmd",
# )
# data_shuffler.add_snapshot(
# "Be_snapshot1.in.h5",
# data_path,
# "Be_snapshot1.out.h5",
# data_path,
# snapshot_type="openpmd",
# )
#
#     # After shuffling, these snapshots can be loaded as regular snapshots
#     # for lazily loaded training.
# data_shuffler.shuffle_snapshots("./", save_name="Be_shuffled*.h5")
#
# test_parameters = mala.Parameters()
# test_parameters.data.shuffling_seed = 1234
# data_shuffler = mala.DataShuffler(test_parameters)
#
#     # Add the snapshots we want to use to the list.
# data_shuffler.add_snapshot(
# "Be_snapshot0.in.npy",
# data_path,
# "Be_snapshot0.out.npy",
# data_path,
# snapshot_type="numpy",
# )
# data_shuffler.add_snapshot(
# "Be_snapshot1.in.npy",
# data_path,
# "Be_snapshot1.out.npy",
# data_path,
# snapshot_type="numpy",
# )
#
#     # After shuffling, these snapshots can be loaded as regular snapshots
#     # for lazily loaded training.
# data_shuffler.shuffle_snapshots("./", save_name="Be_REshuffled*.h5")
#
# old = data_shuffler.target_calculator.read_from_openpmd_file(
# "Be_shuffled1.out.h5"
# )
# new = data_shuffler.target_calculator.read_from_openpmd_file(
# "Be_REshuffled1.out.h5"
# )
# assert np.isclose(np.sum(np.abs(old - new)), 0.0, atol=accuracy)

def test_training(self):
test_parameters = mala.Parameters()
Expand Down Expand Up @@ -326,3 +326,31 @@ def test_training_openpmd(self):
test_trainer.train_network()
new_loss = test_trainer.final_validation_loss
assert old_loss > new_loss

def test_arbitrary_number_snapshots(self):
    """
    Test shuffling into a snapshot count that does not divide the grid.

    The same Be snapshot is added several times and shuffled into a fixed
    number of numpy snapshots. Every shuffled snapshot is then checked for
    data points whose feature vector consists only of zeros, in both the
    descriptor and the target data.
    """
    n_input_snapshots = 5
    n_shuffled_snapshots = 5

    parameters = mala.Parameters()

    # This ensures reproducibility of the created data sets.
    parameters.data.shuffling_seed = 1234

    data_shuffler = mala.DataShuffler(parameters)

    for _ in range(n_input_snapshots):
        data_shuffler.add_snapshot(
            "Be_snapshot0.in.npy",
            data_path,
            "Be_snapshot0.out.npy",
            data_path,
        )
    data_shuffler.shuffle_snapshots(
        complete_save_path=".",
        save_name="Be_shuffled*",
        number_of_shuffled_snapshots=n_shuffled_snapshots,
    )

    # Check every shuffled snapshot, including the last one.
    for i in range(n_shuffled_snapshots):
        bispectrum = np.load(f"Be_shuffled{i}.in.npy")
        ldos = np.load(f"Be_shuffled{i}.out.npy")

        # Test the boolean mask directly. Wrapping it in np.where() would
        # test the indices of the zero rows instead, and a zero row at
        # index 0 would then go unnoticed because the index 0 is falsy.
        assert not np.any(np.all(ldos == 0, axis=-1))
        assert not np.any(np.all(bispectrum == 0, axis=-1))

0 comments on commit a61a489

Please sign in to comment.