<
)
indices = np.random.choice(
-
ngrid**3,
-
size=ngrid**3 - self.data_points_to_remove[idx],
+
ngrid,
+
size=ngrid - self.data_points_to_remove[idx],
)
descriptor_data[idx] = current_descriptor[indices]
@@ -622,117 +627,81 @@
Source code for mala.datahandling.data_shuffler
<
snapshot_type = snapshot_types.pop()
del snapshot_types
- snapshot_size_list = [
- snapshot.grid_size
- for snapshot in self.parameters.snapshot_directories_list
- ]
+ # Set the defaults, these may be changed below as needed.
+ snapshot_size_list = np.array(
+ [
+ snapshot.grid_size
+ for snapshot in self.parameters.snapshot_directories_list
+ ]
+ )
number_of_data_points = np.sum(snapshot_size_list)
-
self.data_points_to_remove = None
-
if number_of_shuffled_snapshots is None:
- # If the user does not tell us how many snapshots to use,
- # we have to check if the number of snapshots is straightforward.
- # If all snapshots have the same size, we can just replicate the
- # snapshot structure.
- if np.max(snapshot_size_list) == np.min(snapshot_size_list):
- shuffle_dimensions = self.parameters.snapshot_directories_list[
- 0
- ].grid_dimension
- number_of_new_snapshots = self.nr_snapshots
- else:
- # If the snapshots have different sizes we simply create
- # (x, 1, 1) snapshots big enough to hold the data.
- number_of_new_snapshots = self.nr_snapshots
- while number_of_data_points % number_of_new_snapshots != 0:
- number_of_new_snapshots += 1
- # If they do have different sizes, we start with the smallest
- # snapshot, there is some padding down below anyhow.
- shuffle_dimensions = [
- int(number_of_data_points / number_of_new_snapshots),
- 1,
- 1,
- ]
+ number_of_shuffled_snapshots = self.nr_snapshots
+ number_of_new_snapshots = number_of_shuffled_snapshots
+
+ if snapshot_type == "openpmd":
+ import math
+ import functools
- if snapshot_type == "openpmd":
- import math
- import functools
-
- number_of_new_snapshots = functools.reduce(
- math.gcd,
- [
- snapshot.grid_dimension[0]
- for snapshot in self.parameters.snapshot_directories_list
- ],
- number_of_new_snapshots,
+ specified_number_of_new_snapshots = number_of_new_snapshots
+ number_of_new_snapshots = functools.reduce(
+ math.gcd,
+ [
+ snapshot.grid_dimension[0]
+ for snapshot in self.parameters.snapshot_directories_list
+ ],
+ number_of_new_snapshots,
+ )
+ if number_of_new_snapshots != specified_number_of_new_snapshots:
+ print(
+ f"[openPMD shuffling] Reduced the number of output snapshots to "
+ f"{number_of_new_snapshots} because of the dataset dimensions."
)
- else:
- number_of_new_snapshots = number_of_shuffled_snapshots
-
- if snapshot_type == "openpmd":
- import math
- import functools
-
- specified_number_of_new_snapshots = number_of_new_snapshots
- number_of_new_snapshots = functools.reduce(
- math.gcd,
- [
- snapshot.grid_dimension[0]
- for snapshot in self.parameters.snapshot_directories_list
- ],
- number_of_new_snapshots,
+ del specified_number_of_new_snapshots
+ elif snapshot_type == "numpy":
+ # Implement all of the below for OpenPMD later.
+ # We need to check if we need to reduce the overall grid size
+ # because the individual snapshots may not contain enough data
+ # points
+ shuffled_gridsizes = snapshot_size_list // number_of_new_snapshots
+
+ if np.any(
+ np.array(snapshot_size_list)
+ - (
+ (np.array(snapshot_size_list) // number_of_new_snapshots)
+ * number_of_new_snapshots
+ )
+ > 0
+ ):
+ number_of_data_points = int(
+ np.sum(shuffled_gridsizes) * number_of_new_snapshots
)
- if (
- number_of_new_snapshots
- != specified_number_of_new_snapshots
- ):
- print(
- f"[openPMD shuffling] Reduced the number of output snapshots to "
- f"{number_of_new_snapshots} because of the dataset dimensions."
- )
- del specified_number_of_new_snapshots
-
- if number_of_data_points % number_of_new_snapshots != 0:
- if snapshot_type == "numpy":
- self.data_points_to_remove = []
- for i in range(0, self.nr_snapshots):
- gridsize = self.parameters.snapshot_directories_list[
- i
- ].grid_size
- shuffled_gridsize = int(
- gridsize / number_of_new_snapshots
- )
- self.data_points_to_remove.append(
- gridsize
- - shuffled_gridsize * number_of_new_snapshots
- )
- tot_points_missing = sum(self.data_points_to_remove)
- printout(
- "Warning: number of requested snapshots is not a divisor of",
- "the original grid sizes.\n",
- f"{tot_points_missing} / {number_of_data_points} data points",
- "will be left out of the shuffled snapshots."
- )
+ self.data_points_to_remove = []
+ for i in range(0, self.nr_snapshots):
+ self.data_points_to_remove.append(
+ snapshot_size_list[i]
+ - shuffled_gridsizes[i] * number_of_new_snapshots
+ )
+ tot_points_missing = sum(self.data_points_to_remove)
+
+ if tot_points_missing > 0:
+ printout(
+ "Warning: number of requested snapshots is not a divisor of",
+ "the original grid sizes.\n",
+ f"{tot_points_missing} / {number_of_data_points} data points",
+ "will be left out of the shuffled snapshots.",
+ )
- shuffle_dimensions = [
- int(number_of_data_points / number_of_new_snapshots),
- 1,
- 1,
- ]
+ else:
+ raise Exception("Invalid snapshot type.")
- elif snapshot_type == "openpmd":
- # TODO implement arbitrary grid sizes for openpmd
- raise Exception(
- "Cannot create this number of snapshots "
- "from data provided."
- )
- else:
- shuffle_dimensions = [
- int(number_of_data_points / number_of_new_snapshots),
- 1,
- 1,
- ]
+ shuffle_dimensions = [
+ int(number_of_data_points / number_of_new_snapshots),
+ 1,
+ 1,
+ ]
printout(
"Data shuffler will generate",
diff --git a/objects.inv b/objects.inv
index 298d6b69..deffd466 100644
Binary files a/objects.inv and b/objects.inv differ