diff --git a/_modules/mala/datahandling/data_shuffler.html b/_modules/mala/datahandling/data_shuffler.html index 7a485010..af72693d 100644 --- a/_modules/mala/datahandling/data_shuffler.html +++ b/_modules/mala/datahandling/data_shuffler.html @@ -138,6 +138,7 @@

Source code for mala.datahandling.data_shuffler

< self.descriptor_calculator.parameters.descriptors_contain_xyz = ( False ) + self.data_points_to_remove = None
[docs] @@ -224,7 +225,11 @@

Source code for mala.datahandling.data_shuffler

< if self.data_points_to_remove is not None: if self.parameters.shuffling_seed is not None: np.random.seed(idx * self.parameters.shuffling_seed) - ngrid = descriptor_data[idx].shape[0] + ngrid = ( + descriptor_data[idx].shape[0] + * descriptor_data[idx].shape[1] + * descriptor_data[idx].shape[2] + ) n_descriptor = descriptor_data[idx].shape[-1] n_target = target_data[idx].shape[-1] @@ -234,8 +239,8 @@

Source code for mala.datahandling.data_shuffler

< ) indices = np.random.choice( - ngrid**3, - size=ngrid**3 - self.data_points_to_remove[idx], + ngrid, + size=ngrid - self.data_points_to_remove[idx], ) descriptor_data[idx] = current_descriptor[indices] @@ -622,117 +627,81 @@

Source code for mala.datahandling.data_shuffler

< snapshot_type = snapshot_types.pop() del snapshot_types - snapshot_size_list = [ - snapshot.grid_size - for snapshot in self.parameters.snapshot_directories_list - ] + # Set the defaults, these may be changed below as needed. + snapshot_size_list = np.array( + [ + snapshot.grid_size + for snapshot in self.parameters.snapshot_directories_list + ] + ) number_of_data_points = np.sum(snapshot_size_list) - self.data_points_to_remove = None - if number_of_shuffled_snapshots is None: - # If the user does not tell us how many snapshots to use, - # we have to check if the number of snapshots is straightforward. - # If all snapshots have the same size, we can just replicate the - # snapshot structure. - if np.max(snapshot_size_list) == np.min(snapshot_size_list): - shuffle_dimensions = self.parameters.snapshot_directories_list[ - 0 - ].grid_dimension - number_of_new_snapshots = self.nr_snapshots - else: - # If the snapshots have different sizes we simply create - # (x, 1, 1) snapshots big enough to hold the data. - number_of_new_snapshots = self.nr_snapshots - while number_of_data_points % number_of_new_snapshots != 0: - number_of_new_snapshots += 1 - # If they do have different sizes, we start with the smallest - # snapshot, there is some padding down below anyhow. - shuffle_dimensions = [ - int(number_of_data_points / number_of_new_snapshots), - 1, - 1, - ] + number_of_shuffled_snapshots = self.nr_snapshots + number_of_new_snapshots = number_of_shuffled_snapshots + + if snapshot_type == "openpmd": + import math + import functools - if snapshot_type == "openpmd": - import math - import functools - - number_of_new_snapshots = functools.reduce( - math.gcd, - [ - snapshot.grid_dimension[0] - for snapshot in self.parameters.snapshot_directories_list - ], - number_of_new_snapshots, + specified_number_of_new_snapshots = number_of_new_snapshots + number_of_new_snapshots = functools.reduce( + math.gcd, + [ + snapshot.grid_dimension[0] + for snapshot in self.parameters.snapshot_directories_list + ], + number_of_new_snapshots, + ) + if number_of_new_snapshots != specified_number_of_new_snapshots: + print( + f"[openPMD shuffling] Reduced the number of output snapshots to " + f"{number_of_new_snapshots} because of the dataset dimensions." ) - else: - number_of_new_snapshots = number_of_shuffled_snapshots - - if snapshot_type == "openpmd": - import math - import functools - - specified_number_of_new_snapshots = number_of_new_snapshots - number_of_new_snapshots = functools.reduce( - math.gcd, - [ - snapshot.grid_dimension[0] - for snapshot in self.parameters.snapshot_directories_list - ], - number_of_new_snapshots, + del specified_number_of_new_snapshots + elif snapshot_type == "numpy": + # Implement all of the below for OpenPMD later. + # We need to check if we need to reduce the overall grid size + # because the individual snapshots may not contain enough data + # points + shuffled_gridsizes = snapshot_size_list // number_of_new_snapshots + + if np.any( + np.array(snapshot_size_list) + - ( + (np.array(snapshot_size_list) // number_of_new_snapshots) + * number_of_new_snapshots + ) + > 0 + ): + number_of_data_points = int( + np.sum(shuffled_gridsizes) * number_of_new_snapshots ) - if ( - number_of_new_snapshots - != specified_number_of_new_snapshots - ): - print( - f"[openPMD shuffling] Reduced the number of output snapshots to " - f"{number_of_new_snapshots} because of the dataset dimensions." - ) - del specified_number_of_new_snapshots - - if number_of_data_points % number_of_new_snapshots != 0: - if snapshot_type == "numpy": - self.data_points_to_remove = [] - for i in range(0, self.nr_snapshots): - gridsize = self.parameters.snapshot_directories_list[ - i - ].grid_size - shuffled_gridsize = int( - gridsize / number_of_new_snapshots - ) - self.data_points_to_remove.append( - gridsize - - shuffled_gridsize * number_of_new_snapshots - ) - tot_points_missing = sum(self.data_points_to_remove) - printout( - "Warning: number of requested snapshots is not a divisor of", - "the original grid sizes.\n", - f"{tot_points_missing} / {number_of_data_points} data points", - "will be left out of the shuffled snapshots." - ) + self.data_points_to_remove = [] + for i in range(0, self.nr_snapshots): + self.data_points_to_remove.append( + snapshot_size_list[i] + - shuffled_gridsizes[i] * number_of_new_snapshots + ) + tot_points_missing = sum(self.data_points_to_remove) + + if tot_points_missing > 0: + printout( + "Warning: number of requested snapshots is not a divisor of", + "the original grid sizes.\n", + f"{tot_points_missing} / {number_of_data_points} data points", + "will be left out of the shuffled snapshots.", + ) - shuffle_dimensions = [ - int(number_of_data_points / number_of_new_snapshots), - 1, - 1, - ] + else: + raise Exception("Invalid snapshot type.") - elif snapshot_type == "openpmd": - # TODO implement arbitrary grid sizes for openpmd - raise Exception( - "Cannot create this number of snapshots " - "from data provided." - ) - else: - shuffle_dimensions = [ - int(number_of_data_points / number_of_new_snapshots), - 1, - 1, - ] + shuffle_dimensions = [ + int(number_of_data_points / number_of_new_snapshots), + 1, + 1, + ] printout( "Data shuffler will generate", diff --git a/objects.inv b/objects.inv index 298d6b69..deffd466 100644 Binary files a/objects.inv and b/objects.inv differ