From 5134fd2ad8e6d2d058ec44bff07421d5cd79a53f Mon Sep 17 00:00:00 2001
From: RandomDefaultUser
MALA uses black for code formatting. The black configuration is located in pyproject.toml. Currently, no automatic code reformatting will be done in the CI, thus please ensure that your code is properly formatted before creating a pull request.

If you add additional dependencies, make sure to add them to requirements.txt.
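For illustration, a minimal sketch of running black locally before opening a pull request (invoking it through subprocess and targeting the mala directory are assumptions for this example; the CLI can equally be called directly from the shell):

# Minimal sketch: format the source tree with black before committing.
# black reads its configuration (line length, excludes, ...) from
# pyproject.toml automatically.
import subprocess

subprocess.run(["black", "mala"], check=True)

# Verify formatting without rewriting any files:
subprocess.run(["black", "--check", "mala"], check=True)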
diff --git a/_modules/mala/common/check_modules.html b/_modules/mala/common/check_modules.html
index 004b69e2c..e8d219da7 100644
--- a/_modules/mala/common/check_modules.html
+++ b/_modules/mala/common/check_modules.html
@@ -75,6 +75,7 @@
"""Function to check module availability in MALA."""
+
import importlib
@@ -82,40 +83,62 @@ Source code for mala.common.check_modules
"""Check whether/which optional modules MALA can access."""
# The optional libs in MALA.
optional_libs = {
- "mpi4py": {"available": False, "description":
- "Enables inference parallelization."},
- "horovod": {"available": False, "description":
- "Enables training parallelization."},
- "lammps": {"available": False, "description":
- "Enables descriptor calculation for data preprocessing "
- "and inference."},
- "oapackage": {"available": False, "description":
- "Enables usage of OAT method for hyperparameter "
- "optimization."},
- "total_energy": {"available": False, "description":
- "Enables calculation of total energy."},
- "asap3": {"available": False, "description":
- "Enables trajectory analysis."},
- "dftpy": {"available": False, "description":
- "Enables OF-DFT-MD initialization."},
- "minterpy": {"available": False, "description":
- "Enables minterpy descriptor calculation for data preprocessing."}
+ "mpi4py": {
+ "available": False,
+ "description": "Enables inference parallelization.",
+ },
+ "horovod": {
+ "available": False,
+ "description": "Enables training parallelization.",
+ },
+ "lammps": {
+ "available": False,
+ "description": "Enables descriptor calculation for data preprocessing "
+ "and inference.",
+ },
+ "oapackage": {
+ "available": False,
+ "description": "Enables usage of OAT method for hyperparameter "
+ "optimization.",
+ },
+ "total_energy": {
+ "available": False,
+ "description": "Enables calculation of total energy.",
+ },
+ "asap3": {
+ "available": False,
+ "description": "Enables trajectory analysis.",
+ },
+ "dftpy": {
+ "available": False,
+ "description": "Enables OF-DFT-MD initialization.",
+ },
+ "minterpy": {
+ "available": False,
+ "description": "Enables minterpy descriptor calculation for data preprocessing.",
+ },
}
# Find out if libs are available.
for lib in optional_libs:
- optional_libs[lib]["available"] = importlib.util.find_spec(lib) \
- is not None
+ optional_libs[lib]["available"] = (
+ importlib.util.find_spec(lib) is not None
+ )
# Print info about libs.
print("The following optional modules are available in MALA:")
for lib in optional_libs:
- available_string = "installed" if optional_libs[lib]["available"] \
- else "not installed"
- print("{0}: \t {1} \t {2}".format(lib, available_string,
- optional_libs[lib]["description"]))
- optional_libs[lib]["available"] = \
- importlib.util.find_spec(lib) is not None
+ available_string = (
+ "installed" if optional_libs[lib]["available"] else "not installed"
+ )
+ print(
+ "{0}: \t {1} \t {2}".format(
+ lib, available_string, optional_libs[lib]["description"]
+ )
+ )
+ optional_libs[lib]["available"] = (
+ importlib.util.find_spec(lib) is not None
+ )
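As a hedged usage sketch of the function reformatted above (assuming check_modules is re-exported at the package level):

import mala

# Prints one line per optional dependency (mpi4py, horovod, lammps, ...)
# with its installation status and a short description.
mala.check_modules()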
"""Functions for operating MALA in parallel."""
+
from collections import defaultdict
import platform
import warnings
@@ -122,8 +123,10 @@ Source code for mala.common.parallelizer
"""
if use_mpi is True and new_value is True:
- raise Exception("Cannot use horovod and inference-level MPI at "
- "the same time yet.")
+ raise Exception(
+ "Cannot use horovod and inference-level MPI at "
+ "the same time yet."
+ )
global use_horovod
use_horovod = new_value
@@ -142,8 +145,10 @@ Source code for mala.common.parallelizer
"""
if use_horovod is True and new_value is True:
- raise Exception("Cannot use horovod and inference-level MPI at "
- "the same time yet.")
+ raise Exception(
+ "Cannot use horovod and inference-level MPI at "
+ "the same time yet."
+ )
global use_mpi
use_mpi = new_value
if use_mpi:
@@ -172,6 +177,7 @@ Source code for mala.common.parallelizer
"""
import lammps
+
global lammps_instance
if isinstance(new_instance, lammps.core.lammps):
lammps_instance = new_instance
@@ -238,7 +244,7 @@ Source code for mala.common.parallelizer
ranks_nodes = comm.allgather((comm.Get_rank(), this_node))
node2rankssofar = defaultdict(int)
local_rank = None
- for (rank, node) in ranks_nodes:
+ for rank, node in ranks_nodes:
if rank == comm.Get_rank():
local_rank = node2rankssofar[node]
node2rankssofar[node] += 1
@@ -280,13 +286,13 @@ Source code for mala.common.parallelizer
[docs]def barrier():
"""General interface for a barrier."""
if use_horovod:
- hvd.allreduce(torch.tensor(0), name='barrier')
+ hvd.allreduce(torch.tensor(0), name="barrier")
if use_mpi:
comm.Barrier()
return
-[docs]def printout(*values, sep=' ', min_verbosity=0):
+[docs]def printout(*values, sep=" ", min_verbosity=0):
"""
Interface to built-in "print" for parallel runs. Can be used like print.
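A short usage sketch of the parallel-aware printing interface above; printout takes print-like positional arguments plus a min_verbosity threshold (the concrete verbosity values below are illustrative):

from mala.common.parallelizer import printout

# Behaves like print, but output is coordinated across ranks and messages
# whose min_verbosity exceeds the currently configured verbosity are dropped.
printout("Starting preprocessing", min_verbosity=0)
printout("Per-snapshot timing details", min_verbosity=2)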
diff --git a/_modules/mala/common/parameters.html b/_modules/mala/common/parameters.html
index a0abd18c2..665e2c379 100644
--- a/_modules/mala/common/parameters.html
+++ b/_modules/mala/common/parameters.html
@@ -75,6 +75,7 @@
Source code for mala.common.parameters
"""Collection of all parameter related classes and functions."""
+
import importlib
import inspect
import json
@@ -85,15 +86,22 @@ Source code for mala.common.parameters
horovod_available = False
try:
import horovod.torch as hvd
+
horovod_available = True
except ModuleNotFoundError:
pass
import numpy as np
import torch
-from mala.common.parallelizer import printout, set_horovod_status, \
- set_mpi_status, get_rank, get_local_rank, set_current_verbosity, \
- parallel_warn
+from mala.common.parallelizer import (
+ printout,
+ set_horovod_status,
+ set_mpi_status,
+ get_rank,
+ get_local_rank,
+ set_current_verbosity,
+ parallel_warn,
+)
from mala.common.json_serializable import JSONSerializable
DEFAULT_NP_DATA_DTYPE = np.float32
@@ -102,11 +110,19 @@ Source code for mala.common.parameters
[docs]class ParametersBase(JSONSerializable):
"""Base parameter class for MALA."""
- def __init__(self,):
+ def __init__(
+ self,
+ ):
super(ParametersBase, self).__init__()
- self._configuration = {"gpu": False, "horovod": False, "mpi": False,
- "device": "cpu", "openpmd_configuration": {},
- "openpmd_granularity": 1, "lammps": True}
+ self._configuration = {
+ "gpu": False,
+ "horovod": False,
+ "mpi": False,
+ "device": "cpu",
+ "openpmd_configuration": {},
+ "openpmd_granularity": 1,
+ "lammps": True,
+ }
pass
[docs] def show(self, indent=""):
@@ -123,11 +139,15 @@ Source code for mala.common.parameters
for v in vars(self):
if v != "_configuration":
if v[0] == "_":
- printout(indent + '%-15s: %s' % (v[1:], getattr(self, v)),
- min_verbosity=0)
+ printout(
+ indent + "%-15s: %s" % (v[1:], getattr(self, v)),
+ min_verbosity=0,
+ )
else:
- printout(indent + '%-15s: %s' % (v, getattr(self, v)),
- min_verbosity=0)
+ printout(
+ indent + "%-15s: %s" % (v, getattr(self, v)),
+ min_verbosity=0,
+ )
def _update_gpu(self, new_gpu):
self._configuration["gpu"] = new_gpu
@@ -168,8 +188,9 @@ Source code for mala.common.parameters
"""
json_dict = {}
- members = inspect.getmembers(self,
- lambda a: not (inspect.isroutine(a)))
+ members = inspect.getmembers(
+ self, lambda a: not (inspect.isroutine(a))
+ )
for member in members:
# Filter out all private members, builtins, etc.
if member[0][0] != "_":
@@ -217,8 +238,9 @@ Source code for mala.common.parameters
else:
# If it is not an elementary builtin type AND not an object
# dictionary, something is definitely off.
- raise Exception("Could not decode JSON file, error in",
- json_value)
+ raise Exception(
+ "Could not decode JSON file, error in", json_value
+ )
[docs] @classmethod
def from_json(cls, json_dict):
@@ -249,8 +271,9 @@ Source code for mala.common.parameters
if len(json_dict[key]) > 0:
_member = []
for m in json_dict[key]:
- _member.append(deserialized_object.
- _json_to_member(m))
+ _member.append(
+ deserialized_object._json_to_member(m)
+ )
setattr(deserialized_object, key, _member)
else:
setattr(deserialized_object, key, json_dict[key])
@@ -259,16 +282,20 @@ Source code for mala.common.parameters
if len(json_dict[key]) > 0:
_member = {}
for m in json_dict[key].keys():
- _member[m] = deserialized_object.\
- _json_to_member(json_dict[key][m])
+ _member[m] = deserialized_object._json_to_member(
+ json_dict[key][m]
+ )
setattr(deserialized_object, key, _member)
else:
setattr(deserialized_object, key, json_dict[key])
else:
- setattr(deserialized_object, key, deserialized_object.
- _json_to_member(json_dict[key]))
+ setattr(
+ deserialized_object,
+ key,
+ deserialized_object._json_to_member(json_dict[key]),
+ )
return deserialized_object
@@ -813,7 +840,7 @@ Source code for mala.common.parameters
self.use_mixed_precision = False
self.use_graphs = False
self.training_report_frequency = 1000
- self.profiler_range = None #[1000, 2000]
+ self.profiler_range = None # [1000, 2000]
def _update_horovod(self, new_horovod):
super(ParametersRunning, self)._update_horovod(new_horovod)
@@ -839,8 +866,10 @@ Source code for mala.common.parameters
def during_training_metric(self, value):
if value != "ldos":
if self._configuration["horovod"]:
- raise Exception("Currently, MALA can only operate with the "
- "\"ldos\" metric for horovod runs.")
+ raise Exception(
+ "Currently, MALA can only operate with the "
+ '"ldos" metric for horovod runs.'
+ )
self._during_training_metric = value
@property
@@ -862,16 +891,20 @@ Source code for mala.common.parameters
def after_before_training_metric(self, value):
if value != "ldos":
if self._configuration["horovod"]:
- raise Exception("Currently, MALA can only operate with the "
- "\"ldos\" metric for horovod runs.")
+ raise Exception(
+ "Currently, MALA can only operate with the "
+ '"ldos" metric for horovod runs.'
+ )
self._after_before_training_metric = value
@during_training_metric.setter
def during_training_metric(self, value):
if value != "ldos":
if self._configuration["horovod"]:
- raise Exception("Currently, MALA can only operate with the "
- "\"ldos\" metric for horovod runs.")
+ raise Exception(
+ "Currently, MALA can only operate with the "
+ '"ldos" metric for horovod runs.'
+ )
self._during_training_metric = value
@property
@@ -887,14 +920,18 @@ Source code for mala.common.parameters
@use_graphs.setter
def use_graphs(self, value):
if value is True:
- if self._configuration["gpu"] is False or \
- torch.version.cuda is None:
+ if (
+ self._configuration["gpu"] is False
+ or torch.version.cuda is None
+ ):
parallel_warn("No CUDA or GPU found, cannot use CUDA graphs.")
value = False
else:
if float(torch.version.cuda) < 11.0:
- raise Exception("Cannot use CUDA graphs with a CUDA"
- " version below 11.0")
+ raise Exception(
+ "Cannot use CUDA graphs with a CUDA"
+ " version below 11.0"
+ )
self._use_graphs = value
@@ -1030,7 +1067,7 @@ Source code for mala.common.parameters
def __init__(self):
super(ParametersHyperparameterOptimization, self).__init__()
- self.direction = 'minimize'
+ self.direction = "minimize"
self.n_trials = 100
self.hlist = []
self.hyper_opt_method = "optuna"
@@ -1110,18 +1147,24 @@ Source code for mala.common.parameters
if v != "_configuration":
if v != "hlist":
if v[0] == "_":
- printout(indent + '%-15s: %s' %
- (v[1:], getattr(self, v)), min_verbosity=0)
+ printout(
+ indent + "%-15s: %s" % (v[1:], getattr(self, v)),
+ min_verbosity=0,
+ )
else:
printout(
- indent + '%-15s: %s' % (v, getattr(self, v)),
- min_verbosity=0)
+ indent + "%-15s: %s" % (v, getattr(self, v)),
+ min_verbosity=0,
+ )
if v == "hlist":
i = 0
for hyp in self.hlist:
- printout(indent + '%-15s: %s' %
- ("hyperparameter #"+str(i), hyp.name),
- min_verbosity=0)
+ printout(
+ indent
+ + "%-15s: %s"
+ % ("hyperparameter #" + str(i), hyp.name),
+ min_verbosity=0,
+ )
i += 1
@@ -1285,7 +1328,9 @@ Source code for mala.common.parameters
self.targets._update_openpmd_granularity(self._openpmd_granularity)
self.data._update_openpmd_granularity(self._openpmd_granularity)
self.running._update_openpmd_granularity(self._openpmd_granularity)
- self.hyperparameters._update_openpmd_granularity(self._openpmd_granularity)
+ self.hyperparameters._update_openpmd_granularity(
+ self._openpmd_granularity
+ )
@property
def verbosity(self):
@@ -1320,8 +1365,10 @@ Source code for mala.common.parameters
if torch.cuda.is_available():
self._use_gpu = True
else:
- parallel_warn("GPU requested, but no GPU found. MALA will "
- "operate with CPU only.")
+ parallel_warn(
+ "GPU requested, but no GPU found. MALA will "
+ "operate with CPU only."
+ )
# Invalidate, will be updated in setter.
self.device = None
@@ -1355,9 +1402,10 @@ Source code for mala.common.parameters
self.running._update_horovod(self.use_horovod)
self.hyperparameters._update_horovod(self.use_horovod)
else:
- parallel_warn("Horovod requested, but not installed found. "
- "MALA will operate without horovod only.")
-
+ parallel_warn(
+ "Horovod requested, but not installed found. "
+ "MALA will operate without horovod only."
+ )
@property
def device(self):
@@ -1368,8 +1416,7 @@ Source code for mala.common.parameters
def device(self, value):
device_id = get_local_rank()
if self.use_gpu:
- self._device = "cuda:"\
- f"{device_id}"
+ self._device = "cuda:" f"{device_id}"
else:
self._device = "cpu"
self.network._update_device(self._device)
@@ -1413,11 +1460,15 @@ Source code for mala.common.parameters
def openpmd_configuration(self, value):
self._openpmd_configuration = value
self.network._update_openpmd_configuration(self.openpmd_configuration)
- self.descriptors._update_openpmd_configuration(self.openpmd_configuration)
+ self.descriptors._update_openpmd_configuration(
+ self.openpmd_configuration
+ )
self.targets._update_openpmd_configuration(self.openpmd_configuration)
self.data._update_openpmd_configuration(self.openpmd_configuration)
self.running._update_openpmd_configuration(self.openpmd_configuration)
- self.hyperparameters._update_openpmd_configuration(self.openpmd_configuration)
+ self.hyperparameters._update_openpmd_configuration(
+ self.openpmd_configuration
+ )
@property
def use_lammps(self):
@@ -1436,8 +1487,9 @@ Source code for mala.common.parameters
[docs] def show(self):
"""Print name and values of all attributes of this object."""
- printout("--- " + self.__doc__.split("\n")[1] + " ---",
- min_verbosity=0)
+ printout(
+ "--- " + self.__doc__.split("\n")[1] + " ---", min_verbosity=0
+ )
# Two for-statements so that global parameters are shown on top.
for v in vars(self):
@@ -1445,16 +1497,21 @@ Source code for mala.common.parameters
pass
else:
if v[0] == "_":
- printout('%-15s: %s' % (v[1:], getattr(self, v)),
- min_verbosity=0)
+ printout(
+ "%-15s: %s" % (v[1:], getattr(self, v)),
+ min_verbosity=0,
+ )
else:
- printout('%-15s: %s' % (v, getattr(self, v)),
- min_verbosity=0)
+ printout(
+ "%-15s: %s" % (v, getattr(self, v)), min_verbosity=0
+ )
for v in vars(self):
if isinstance(getattr(self, v), ParametersBase):
parobject = getattr(self, v)
- printout("--- " + parobject.__doc__.split("\n")[1] + " ---",
- min_verbosity=0)
+ printout(
+ "--- " + parobject.__doc__.split("\n")[1] + " ---",
+ min_verbosity=0,
+ )
parobject.show("\t")
[docs] def save(self, filename, save_format="json"):
@@ -1477,14 +1534,15 @@ Source code for mala.common.parameters
if save_format == "pickle":
if filename[-3:] != "pkl":
filename += ".pkl"
- with open(filename, 'wb') as handle:
+ with open(filename, "wb") as handle:
pickle.dump(self, handle, protocol=4)
elif save_format == "json":
if filename[-4:] != "json":
filename += ".json"
json_dict = {}
- members = inspect.getmembers(self,
- lambda a: not (inspect.isroutine(a)))
+ members = inspect.getmembers(
+ self, lambda a: not (inspect.isroutine(a))
+ )
# Two for loops so global properties enter the dict first.
for member in members:
@@ -1556,7 +1614,7 @@ Source code for mala.common.parameters
self.use_gpu = True
self.use_mpi = True
device_temp = self.device
- sleep(get_rank()*wait_time)
+ sleep(get_rank() * wait_time)
# Now we can turn off MPI and set the device manually.
self.use_mpi = False
@@ -1569,8 +1627,7 @@ Source code for mala.common.parameters
self.hyperparameters._update_device(device_temp)
[docs] @classmethod
- def load_from_file(cls, file, save_format="json",
- no_snapshots=False):
+ def load_from_file(cls, file, save_format="json", no_snapshots=False):
"""
Load a Parameters object from a file.
@@ -1595,7 +1652,7 @@ Source code for mala.common.parameters
"""
if save_format == "pickle":
if isinstance(file, str):
- loaded_parameters = pickle.load(open(file, 'rb'))
+ loaded_parameters = pickle.load(open(file, "rb"))
else:
loaded_parameters = pickle.load(file)
if no_snapshots is True:
@@ -1608,19 +1665,23 @@ Source code for mala.common.parameters
loaded_parameters = cls()
for key in json_dict:
- if isinstance(json_dict[key], dict) and key \
- != "openpmd_configuration":
+ if (
+ isinstance(json_dict[key], dict)
+ and key != "openpmd_configuration"
+ ):
# These are the other parameter classes.
- sub_parameters =\
- globals()[json_dict[key]["_parameters_type"]].\
- from_json(json_dict[key])
+ sub_parameters = globals()[
+ json_dict[key]["_parameters_type"]
+ ].from_json(json_dict[key])
setattr(loaded_parameters, key, sub_parameters)
# We iterate a second time, to set global values, so that they
# are properly forwarded.
for key in json_dict:
- if not isinstance(json_dict[key], dict) or key == \
- "openpmd_configuration":
+ if (
+ not isinstance(json_dict[key], dict)
+ or key == "openpmd_configuration"
+ ):
setattr(loaded_parameters, key, json_dict[key])
if no_snapshots is True:
loaded_parameters.data.snapshot_directories_list = []
@@ -1649,8 +1710,9 @@ Source code for mala.common.parameters
The loaded Parameters object.
"""
- return Parameters.load_from_file(file, save_format="pickle",
- no_snapshots=no_snapshots)
+ return Parameters.load_from_file(
+ file, save_format="pickle", no_snapshots=no_snapshots
+ )
[docs] @classmethod
def load_from_json(cls, file, no_snapshots=False):
@@ -1672,8 +1734,9 @@ Source code for mala.common.parameters
The loaded Parameters object.
"""
- return Parameters.load_from_file(file, save_format="json",
- no_snapshots=no_snapshots)
+ return Parameters.load_from_file(
+ file, save_format="json", no_snapshots=no_snapshots
+ )
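To illustrate the save()/load_from_file() paths reformatted above, a minimal round-trip sketch (the max_number_epochs attribute and the file name are illustrative assumptions):

import mala

params = mala.Parameters()
params.running.max_number_epochs = 100  # assumed attribute, for illustration

# Serialize the Parameters object, including all nested ParametersBase
# subclasses, to JSON.
params.save("my_run.params.json", save_format="json")

# Reconstruct the object from the JSON file.
reloaded = mala.Parameters.load_from_json("my_run.params.json")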
diff --git a/_modules/mala/common/physical_data.html b/_modules/mala/common/physical_data.html
index 5f496a5d4..dfcf1776a 100644
--- a/_modules/mala/common/physical_data.html
+++ b/_modules/mala/common/physical_data.html
@@ -75,6 +75,7 @@
Source code for mala.common.physical_data
"""Base class for all calculators that deal with physical data."""
+
from abc import ABC, abstractmethod
import os
@@ -143,7 +144,9 @@ Source code for mala.common.physical_data
# because there is no need to.
##############################
-[docs] def read_from_numpy_file(self, path, units=None, array=None, reshape=False):
+[docs] def read_from_numpy_file(
+ self, path, units=None, array=None, reshape=False
+ ):
"""
Read the data from a numpy file.
@@ -168,17 +171,19 @@ Source code for mala.common.physical_data
"""
if array is None:
- loaded_array = np.load(path)[:, :, :, self._feature_mask():]
+ loaded_array = np.load(path)[:, :, :, self._feature_mask() :]
self._process_loaded_array(loaded_array, units=units)
return loaded_array
else:
if reshape:
array_dims = np.shape(array)
- array[:, :] = np.load(path)[:, :, :, self._feature_mask() :].reshape(
- array_dims
- )
+ array[:, :] = np.load(path)[
+ :, :, :, self._feature_mask() :
+ ].reshape(array_dims)
else:
- array[:, :, :, :] = np.load(path)[:, :, :, self._feature_mask() :]
+ array[:, :, :, :] = np.load(path)[
+ :, :, :, self._feature_mask() :
+ ]
self._process_loaded_array(array, units=units)
[docs] def read_from_openpmd_file(self, path, units=None, array=None):
@@ -216,15 +221,19 @@ Source code for mala.common.physical_data
# {"defer_iteration_parsing": True} |
# self.parameters.
# _configuration["openpmd_configuration"]))
- options = self.parameters._configuration["openpmd_configuration"].copy()
+ options = self.parameters._configuration[
+ "openpmd_configuration"
+ ].copy()
options["defer_iteration_parsing"] = True
- series = io.Series(path, io.Access.read_only,
- options=json.dumps(options))
+ series = io.Series(
+ path, io.Access.read_only, options=json.dumps(options)
+ )
# Check if this is actually MALA-compatible data.
if series.get_attribute("is_mala_data") != 1:
- raise Exception("Non-MALA data detected, cannot work with this "
- "data.")
+ raise Exception(
+ "Non-MALA data detected, cannot work with this data."
+ )
# A bit clunky, but this way only the FIRST iteration is loaded,
# which is what we need for loading from a single file that
@@ -243,24 +252,35 @@ Source code for mala.common.physical_data
# the feature dimension with 0,1,... ? I can't think of one.
# But there may be in the future, and this'll break
if array is None:
- data = np.zeros((mesh["0"].shape[0], mesh["0"].shape[1],
- mesh["0"].shape[2], len(mesh)-self._feature_mask()),
- dtype=mesh["0"].dtype)
+ data = np.zeros(
+ (
+ mesh["0"].shape[0],
+ mesh["0"].shape[1],
+ mesh["0"].shape[2],
+ len(mesh) - self._feature_mask(),
+ ),
+ dtype=mesh["0"].dtype,
+ )
else:
- if array.shape[0] != mesh["0"].shape[0] or \
- array.shape[1] != mesh["0"].shape[1] or \
- array.shape[2] != mesh["0"].shape[2] or \
- array.shape[3] != len(mesh)-self._feature_mask():
- raise Exception("Cannot load data into array, wrong "
- "shape provided.")
+ if (
+ array.shape[0] != mesh["0"].shape[0]
+ or array.shape[1] != mesh["0"].shape[1]
+ or array.shape[2] != mesh["0"].shape[2]
+ or array.shape[3] != len(mesh) - self._feature_mask()
+ ):
+ raise Exception(
+ "Cannot load data into array, wrong shape provided."
+ )
# Only check this once, since we do not save arrays with different
# units throughout the feature dimension.
# Later, we can merge this unit check with the unit conversion
# MALA does naturally.
if not np.isclose(mesh[str(0)].unit_SI, self.si_unit_conversion):
- raise Exception("MALA currently cannot operate with OpenPMD "
- "files with non-MALA units.")
+ raise Exception(
+ "MALA currently cannot operate with OpenPMD "
+ "files with non-MALA units."
+ )
# Deal with `granularity` items of the vectors at a time
# Or in the openPMD layout: with `granularity` record components
@@ -272,21 +292,35 @@ Source code for mala.common.physical_data
else:
array_shape = array.shape
data_type = array.dtype
- for base in range(self._feature_mask(), array_shape[3]+self._feature_mask(),
- granularity):
- end = min(base + granularity, array_shape[3]+self._feature_mask())
+ for base in range(
+ self._feature_mask(),
+ array_shape[3] + self._feature_mask(),
+ granularity,
+ ):
+ end = min(
+ base + granularity, array_shape[3] + self._feature_mask()
+ )
transposed = np.empty(
(end - base, array_shape[0], array_shape[1], array_shape[2]),
- dtype=data_type)
+ dtype=data_type,
+ )
for i in range(base, end):
mesh[str(i)].load_chunk(transposed[i - base, :, :, :])
series.flush()
if array is None:
- data[:, :, :, base-self._feature_mask():end-self._feature_mask()] \
- = np.transpose(transposed, axes=[1, 2, 3, 0])[:, :, :, :]
+ data[
+ :,
+ :,
+ :,
+ base - self._feature_mask() : end - self._feature_mask(),
+ ] = np.transpose(transposed, axes=[1, 2, 3, 0])[:, :, :, :]
else:
- array[:, :, :, base-self._feature_mask():end-self._feature_mask()] \
- = np.transpose(transposed, axes=[1, 2, 3, 0])[:, :, :, :]
+ array[
+ :,
+ :,
+ :,
+ base - self._feature_mask() : end - self._feature_mask(),
+ ] = np.transpose(transposed, axes=[1, 2, 3, 0])[:, :, :, :]
if array is None:
self._process_loaded_array(data, units=units)
@@ -308,13 +342,16 @@ Source code for mala.common.physical_data
"""
loaded_array = np.load(path, mmap_mode="r")
if read_dtype:
- return self._process_loaded_dimensions(np.shape(loaded_array)), \
- loaded_array.dtype
+ return (
+ self._process_loaded_dimensions(np.shape(loaded_array)),
+ loaded_array.dtype,
+ )
else:
return self._process_loaded_dimensions(np.shape(loaded_array))
-[docs] def read_dimensions_from_openpmd_file(self, path, comm=None,
- read_dtype=False):
+[docs] def read_dimensions_from_openpmd_file(
+ self, path, comm=None, read_dtype=False
+ ):
"""
Read only the dimensions from a openPMD file.
@@ -328,6 +365,7 @@ Source code for mala.common.physical_data
"""
if comm is None or comm.rank == 0:
import openpmd_api as io
+
# The union operator for dicts is only supported starting with
# python 3.9. Currently, MALA works down to python 3.8; For now,
# I think it is good to keep it that way.
@@ -339,17 +377,18 @@ Source code for mala.common.physical_data
# self.parameters.
# _configuration["openpmd_configuration"]))
options = self.parameters._configuration[
- "openpmd_configuration"].copy()
+ "openpmd_configuration"
+ ].copy()
options["defer_iteration_parsing"] = True
- series = io.Series(path,
- io.Access.read_only,
- options=json.dumps(options))
+ series = io.Series(
+ path, io.Access.read_only, options=json.dumps(options)
+ )
# Check if this is actually MALA-compatible data.
if series.get_attribute("is_mala_data") != 1:
raise Exception(
- "Non-MALA data detected, cannot work with this "
- "data.")
+ "Non-MALA data detected, cannot work with this data."
+ )
# A bit clunky, but this way only the FIRST iteration is loaded,
# which is what we need for loading from a single file that
@@ -359,8 +398,12 @@ Source code for mala.common.physical_data
# and no others.
for current_iteration in series.read_iterations():
mesh = current_iteration.meshes[self.data_name]
- tuple_from_file = [mesh["0"].shape[0], mesh["0"].shape[1],
- mesh["0"].shape[2], len(mesh)]
+ tuple_from_file = [
+ mesh["0"].shape[0],
+ mesh["0"].shape[1],
+ mesh["0"].shape[2],
+ len(mesh),
+ ]
loaded_dtype = mesh["0"].dtype
break
series.close()
@@ -370,8 +413,10 @@ Source code for mala.common.physical_data
if comm is not None:
tuple_from_file = comm.bcast(tuple_from_file, root=0)
if read_dtype:
- return self._process_loaded_dimensions(tuple(tuple_from_file)), \
- loaded_dtype
+ return (
+ self._process_loaded_dimensions(tuple(tuple_from_file)),
+ loaded_dtype,
+ )
else:
return self._process_loaded_dimensions(tuple(tuple_from_file))
@@ -418,8 +463,13 @@ Source code for mala.common.physical_data
self.dataset = dataset
self.feature_size = feature_size
-[docs] def write_to_openpmd_file(self, path, array, additional_attributes={},
- internal_iteration_number=0):
+[docs] def write_to_openpmd_file(
+ self,
+ path,
+ array,
+ additional_attributes={},
+ internal_iteration_number=0,
+ ):
"""
Write data to an OpenPMD file.
@@ -449,21 +499,24 @@ Source code for mala.common.physical_data
if file_name == file_ending:
path += ".h5"
elif file_ending not in io.file_extensions:
- raise Exception("Invalid file ending selected: " +
- file_ending)
+ raise Exception("Invalid file ending selected: " + file_ending)
if self.parameters._configuration["mpi"]:
series = io.Series(
path,
io.Access.create,
get_comm(),
options=json.dumps(
- self.parameters._configuration["openpmd_configuration"]))
+ self.parameters._configuration["openpmd_configuration"]
+ ),
+ )
else:
series = io.Series(
path,
io.Access.create,
options=json.dumps(
- self.parameters._configuration["openpmd_configuration"]))
+ self.parameters._configuration["openpmd_configuration"]
+ ),
+ )
elif isinstance(path, io.Series):
series = path
@@ -478,18 +531,24 @@ Source code for mala.common.physical_data
# This function may be called without the feature dimension
# explicitly set (i.e. during testing or post-processing).
# We have to check for that.
- if self.feature_size == 0 and not isinstance(array,
- self.SkipArrayWriting):
+ if self.feature_size == 0 and not isinstance(
+ array, self.SkipArrayWriting
+ ):
self._set_feature_size_from_array(array)
self.write_to_openpmd_iteration(iteration, array)
return series
-[docs] def write_to_openpmd_iteration(self, iteration, array,
- local_offset=None,
- local_reach=None,
- additional_metadata=None,
- feature_from=0, feature_to=None):
+[docs] def write_to_openpmd_iteration(
+ self,
+ iteration,
+ array,
+ local_offset=None,
+ local_reach=None,
+ additional_metadata=None,
+ feature_from=0,
+ feature_to=None,
+ ):
"""
Write a file within an OpenPMD iteration.
@@ -532,39 +591,50 @@ Source code for mala.common.physical_data
atomic_numbers = atoms_ase.get_atomic_numbers()
positions = io.Dataset(
# Need bugfix https://github.com/openPMD/openPMD-api/pull/1357
- atomic_positions[0].dtype if io.__version__ >= '0.15.0' else
- io.Datatype.DOUBLE,
- atomic_positions[0].shape)
- numbers = io.Dataset(atomic_numbers[0].dtype,
- [1])
- iteration.set_attribute("periodic_boundary_conditions_x",
- atoms_ase.pbc[0])
- iteration.set_attribute("periodic_boundary_conditions_y",
- atoms_ase.pbc[1])
- iteration.set_attribute("periodic_boundary_conditions_z",
- atoms_ase.pbc[2])
+ (
+ atomic_positions[0].dtype
+ if io.__version__ >= "0.15.0"
+ else io.Datatype.DOUBLE
+ ),
+ atomic_positions[0].shape,
+ )
+ numbers = io.Dataset(atomic_numbers[0].dtype, [1])
+ iteration.set_attribute(
+ "periodic_boundary_conditions_x", atoms_ase.pbc[0]
+ )
+ iteration.set_attribute(
+ "periodic_boundary_conditions_y", atoms_ase.pbc[1]
+ )
+ iteration.set_attribute(
+ "periodic_boundary_conditions_z", atoms_ase.pbc[2]
+ )
# atoms_openpmd["position"].time_offset = 0.0
# atoms_openpmd["positionOffset"].time_offset = 0.0
for atom in range(0, len(atoms_ase)):
atoms_openpmd["position"][str(atom)].reset_dataset(positions)
atoms_openpmd["number"][str(atom)].reset_dataset(numbers)
- atoms_openpmd["positionOffset"][str(atom)].reset_dataset(positions)
+ atoms_openpmd["positionOffset"][str(atom)].reset_dataset(
+ positions
+ )
atoms_openpmd_position = atoms_openpmd["position"][str(atom)]
atoms_openpmd_number = atoms_openpmd["number"][str(atom)]
if get_rank() == 0:
atoms_openpmd_position.store_chunk(atomic_positions[atom])
atoms_openpmd_number.store_chunk(
- np.array([atomic_numbers[atom]]))
+ np.array([atomic_numbers[atom]])
+ )
atoms_openpmd["positionOffset"][str(atom)].make_constant(0)
# Positions are stored in Angstrom.
atoms_openpmd["position"][str(atom)].unit_SI = 1.0e-10
atoms_openpmd["positionOffset"][str(atom)].unit_SI = 1.0e-10
- dataset = array.dataset if isinstance(
- array, self.SkipArrayWriting) else io.Dataset(
- array.dtype, self.grid_dimensions)
+ dataset = (
+ array.dataset
+ if isinstance(array, self.SkipArrayWriting)
+ else io.Dataset(array.dtype, self.grid_dimensions)
+ )
# Global feature sizes:
feature_global_from = 0
@@ -592,11 +662,14 @@ Source code for mala.common.physical_data
feature_to = array.shape[3]
if feature_to - feature_from != array.shape[3]:
- raise RuntimeError("""\
-[write_to_openpmd_iteration] Internal error, called function with
-wrong parameters. Specification of features ({} - {}) on rank {} does not
-match the array dimensions (extent {} in the feature dimension)""".format(
- feature_from, feature_to, get_rank(), array.shape[3]))
+ raise RuntimeError(
+ """\
+[write_to_openpmd_iteration] Internal error, called function with
+wrong parameters. Specification of features ({} - {}) on rank {} does not
+match the array dimensions (extent {} in the feature dimension)""".format(
+ feature_from, feature_to, get_rank(), array.shape[3]
+ )
+ )
# See above - will currently break for density of states,
# which is something we never do though anyway.
@@ -614,9 +687,11 @@ Source code for mala.common.physical_data
# features are written from all ranks.
if self.parameters._configuration["mpi"]:
from mpi4py import MPI
+
my_iteration_count = len(range(0, array.shape[3], granularity))
- highest_iteration_count = get_comm().allreduce(my_iteration_count,
- op=MPI.MAX)
+ highest_iteration_count = get_comm().allreduce(
+ my_iteration_count, op=MPI.MAX
+ )
extra_flushes = highest_iteration_count - my_iteration_count
else:
extra_flushes = 0
@@ -624,8 +699,9 @@ Source code for mala.common.physical_data
# Second loop: Write heavy data
for base in range(0, array.shape[3], granularity):
end = min(base + granularity, array.shape[3])
- transposed = \
- np.transpose(array[:, :, :, base:end], axes=[3, 0, 1, 2]).copy()
+ transposed = np.transpose(
+ array[:, :, :, base:end], axes=[3, 0, 1, 2]
+ ).copy()
for i in range(base, end):
# i is the index within the array passed to this function.
# The feature corresponding to this index is offset
@@ -633,8 +709,9 @@ Source code for mala.common.physical_data
current_feature = i + feature_from
mesh_component = mesh[str(current_feature)]
- mesh_component[x_from:x_to, y_from:y_to, z_from:z_to] = \
+ mesh_component[x_from:x_to, y_from:y_to, z_from:z_to] = (
transposed[i - base, :, :, :]
+ )
iteration.series_flush()
@@ -679,9 +756,9 @@ Source code for mala.common.physical_data
# MALA internally operates in Angstrom (10^-10 m)
mesh.grid_unit_SI = 1e-10
- mesh.comment = \
- "This is a special geometry, " \
- "based on the cartesian geometry."
+ mesh.comment = (
+ "This is a special geometry, based on the cartesian geometry."
+ )
# Fill geometry information (if provided)
self._set_geometry_info(mesh)
@@ -698,8 +775,9 @@ Source code for mala.common.physical_data
return None
@staticmethod
- def _get_attribute_if_attribute_exists(iteration, attribute,
- default_value=None):
+ def _get_attribute_if_attribute_exists(
+ iteration, attribute, default_value=None
+ ):
if attribute in iteration.attributes:
return iteration.get_attribute(attribute)
else:
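A hedged sketch of how the numpy reading routines above are typically used through a calculator object; the Descriptor factory call is an assumption, the dimension-reading method name is inferred by analogy to read_dimensions_from_openpmd_file, and the file name is hypothetical:

import mala

params = mala.Parameters()
descriptors = mala.Descriptor(params)  # a PhysicalData subclass

# Read only the grid dimensions (memory-mapped load, as in the code above).
dims = descriptors.read_dimensions_from_numpy_file("Be_snapshot0.in.npy")

# Load the full array; the feature mask strips any leading metadata columns.
data = descriptors.read_from_numpy_file("Be_snapshot0.in.npy")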
diff --git a/_modules/mala/datageneration/ofdft_initializer.html b/_modules/mala/datageneration/ofdft_initializer.html
index 9844edf9c..24882061a 100644
--- a/_modules/mala/datageneration/ofdft_initializer.html
+++ b/_modules/mala/datageneration/ofdft_initializer.html
@@ -75,6 +75,7 @@
Source code for mala.datageneration.ofdft_initializer
"""Tools for initializing a (ML)-DFT trajectory with OF-DFT."""
+
from warnings import warn
from ase import units
@@ -83,6 +84,7 @@ Source code for mala.datageneration.ofdft_initializer
from ase.md.langevin import Langevin
from ase.io.trajectory import Trajectory
from ase.md.velocitydistribution import MaxwellBoltzmannDistribution
+
try:
from dftpy.api.api4ase import DFTpyCalculator
from dftpy.config import DefaultOption, OptionFormat
@@ -105,25 +107,29 @@ Source code for mala.datageneration.ofdft_initializer
"""
def __init__(self, parameters, atoms):
- warn("The class OFDFTInitializer is experimental. The algorithms "
- "within have been tested, but the API may still be subject to "
- "large changes.")
+ warn(
+ "The class OFDFTInitializer is experimental. The algorithms "
+ "within have been tested, but the API may still be subject to "
+ "large changes."
+ )
self.atoms = atoms
self.params = parameters.datageneration
# Check that only one element is used in the atoms.
number_of_elements = len(set([x.symbol for x in self.atoms]))
if number_of_elements > 1:
- raise Exception("OF-DFT-MD initialization can only work with one"
- " element.")
+ raise Exception(
+ "OF-DFT-MD initialization can only work with one element."
+ )
self.dftpy_configuration = DefaultOption()
- self.dftpy_configuration['PATH']['pppath'] = self.params.local_psp_path
- self.dftpy_configuration['PP'][self.atoms[0].symbol] = \
- self.params.local_psp_name
- self.dftpy_configuration['OPT']['method'] = self.params.ofdft_kedf
- self.dftpy_configuration['KEDF']['kedf'] = 'WT'
- self.dftpy_configuration['JOB']['calctype'] = 'Energy Force'
+ self.dftpy_configuration["PATH"]["pppath"] = self.params.local_psp_path
+ self.dftpy_configuration["PP"][
+ self.atoms[0].symbol
+ ] = self.params.local_psp_name
+ self.dftpy_configuration["OPT"]["method"] = self.params.ofdft_kedf
+ self.dftpy_configuration["KEDF"]["kedf"] = "WT"
+ self.dftpy_configuration["JOB"]["calctype"] = "Energy Force"
[docs] def get_equilibrated_configuration(self, logging_period=None):
"""
@@ -143,20 +149,33 @@ Source code for mala.datageneration.ofdft_initializer
self.atoms.set_calculator(calc)
# Create the initial velocities, and dynamics object.
- MaxwellBoltzmannDistribution(self.atoms,
- temperature_K=
- self.params.ofdft_temperature,
- force_temp=True)
- dyn = Langevin(self.atoms, self.params.ofdft_timestep * units.fs,
- temperature_K=self.params.ofdft_temperature,
- friction=self.params.ofdft_friction)
+ MaxwellBoltzmannDistribution(
+ self.atoms,
+ temperature_K=self.params.ofdft_temperature,
+ force_temp=True,
+ )
+ dyn = Langevin(
+ self.atoms,
+ self.params.ofdft_timestep * units.fs,
+ temperature_K=self.params.ofdft_temperature,
+ friction=self.params.ofdft_friction,
+ )
# If logging is desired, do the logging.
if logging_period is not None:
- dyn.attach(MDLogger(dyn, self.atoms, 'mala_of_dft_md.log',
- header=False, stress=False, peratom=True,
- mode="w"), interval=logging_period)
- traj = Trajectory('mala_of_dft_md.traj', 'w', self.atoms)
+ dyn.attach(
+ MDLogger(
+ dyn,
+ self.atoms,
+ "mala_of_dft_md.log",
+ header=False,
+ stress=False,
+ peratom=True,
+ mode="w",
+ ),
+ interval=logging_period,
+ )
+ traj = Trajectory("mala_of_dft_md.traj", "w", self.atoms)
dyn.attach(traj.write, interval=logging_period)
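A usage sketch of the OFDFTInitializer reformatted above (the structure file, pseudopotential names, and parameter values are illustrative; dftpy must be installed for this to run):

from ase.io import read

import mala

params = mala.Parameters()
params.datageneration.ofdft_temperature = 300
params.datageneration.local_psp_path = "/path/to/pseudopotentials/"  # hypothetical
params.datageneration.local_psp_name = "Be.lda.recpot"  # hypothetical

atoms = read("Be_initial_structure.vasp")  # any single-element ASE structure

initializer = mala.OFDFTInitializer(params, atoms)
equilibrated_atoms = initializer.get_equilibrated_configuration(logging_period=10)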
diff --git a/_modules/mala/datageneration/trajectory_analyzer.html b/_modules/mala/datageneration/trajectory_analyzer.html
index aa82df007..d3ac61840 100644
--- a/_modules/mala/datageneration/trajectory_analyzer.html
+++ b/_modules/mala/datageneration/trajectory_analyzer.html
@@ -75,6 +75,7 @@
Source code for mala.datageneration.trajectory_analyzer
"""Tools for analyzing a trajectory."""
+
from functools import cached_property
import os
from warnings import warn
@@ -106,12 +107,20 @@ Source code for mala.datageneration.trajectory_analyzer
one will be generated ad-hoc (recommended).
"""
- def __init__(self, parameters, trajectory, temperatures=None,
- target_calculator=None, target_temperature=None,
- malada_compatability=False):
- warn("The class TrajectoryAnalyzer is experimental. The algorithms "
- "within have been tested, but the API may still be subject to "
- "large changes.")
+ def __init__(
+ self,
+ parameters,
+ trajectory,
+ temperatures=None,
+ target_calculator=None,
+ target_temperature=None,
+ malada_compatability=False,
+ ):
+ warn(
+ "The class TrajectoryAnalyzer is experimental. The algorithms "
+ "within have been tested, but the API may still be subject to "
+ "large changes."
+ )
self.params: ParametersDataGeneration = parameters.datageneration
@@ -187,8 +196,9 @@ Source code for mala.datageneration.trajectory_analyzer
"""Cutoff for the snapshot correlation analysis."""
return self.get_snapshot_correlation_cutoff()
-[docs] def get_first_snapshot(self, equilibrated_snapshot=None,
- distance_threshold=None):
+[docs] def get_first_snapshot(
+ self, equilibrated_snapshot=None, distance_threshold=None
+ ):
"""
Calculate distance metrics/first equilibrated timestep on a trajectory.
@@ -220,39 +230,55 @@ Source code for mala.datageneration.trajectory_analyzer
if equilibrated_snapshot is None:
equilibrated_snapshot = self.trajectory[-1]
for idx, step in enumerate(self.trajectory):
- self.distance_metrics.append(self.
- _calculate_distance_between_snapshots
- (equilibrated_snapshot, step, "rdf",
- "cosine_distance", save_rdf1=True))
+ self.distance_metrics.append(
+ self._calculate_distance_between_snapshots(
+ equilibrated_snapshot,
+ step,
+ "rdf",
+ "cosine_distance",
+ save_rdf1=True,
+ )
+ )
# Now, we denoise the distance metrics.
self.distance_metrics_denoised = self.__denoise(self.distance_metrics)
# Which snapshots are considered depends on how we denoise the
# distance metrics.
- self.first_considered_snapshot = \
- self.params.trajectory_analysis_denoising_width
- self.last_considered_snapshot = \
- np.shape(self.distance_metrics_denoised)[0]-\
+ self.first_considered_snapshot = (
self.params.trajectory_analysis_denoising_width
- considered_length = self.last_considered_snapshot - \
- self.first_considered_snapshot
+ )
+ self.last_considered_snapshot = (
+ np.shape(self.distance_metrics_denoised)[0]
+ - self.params.trajectory_analysis_denoising_width
+ )
+ considered_length = (
+ self.last_considered_snapshot - self.first_considered_snapshot
+ )
# Next, the average of the presumed equilibrated part is calculated,
# and then the first N number of timesteps which are below this
# average is calculated.
self.average_distance_equilibrated = distance_threshold
if self.average_distance_equilibrated is None:
- self.average_distance_equilibrated = \
- np.mean(self.distance_metrics_denoised[considered_length -
- int(self.params.trajectory_analysis_estimated_equilibrium * considered_length):
- self.last_considered_snapshot])
+ self.average_distance_equilibrated = np.mean(
+ self.distance_metrics_denoised[
+ considered_length
+ - int(
+ self.params.trajectory_analysis_estimated_equilibrium
+ * considered_length
+ ) : self.last_considered_snapshot
+ ]
+ )
is_below = True
counter = 0
first_snapshot = None
for idx, dist in enumerate(self.distance_metrics_denoised):
- if self.first_considered_snapshot <= idx \
- <= self.last_considered_snapshot:
+ if (
+ self.first_considered_snapshot
+ <= idx
+ <= self.last_considered_snapshot
+ ):
if is_below:
counter += 1
if dist < self.average_distance_equilibrated:
@@ -260,12 +286,16 @@ Source code for mala.datageneration.trajectory_analyzer
if dist >= self.average_distance_equilibrated:
counter = 0
is_below = False
- if counter == self.params.\
- trajectory_analysis_below_average_counter:
+ if (
+ counter
+ == self.params.trajectory_analysis_below_average_counter
+ ):
first_snapshot = idx
break
- printout("First equilibrated timestep of trajectory is", first_snapshot)
+ printout(
+ "First equilibrated timestep of trajectory is", first_snapshot
+ )
return first_snapshot
[docs] def get_snapshot_correlation_cutoff(self):
@@ -307,100 +337,134 @@ Source code for mala.datageneration.trajectory_analyzer
filename_uncorrelated_snapshots : string
Name of the file in which to save the uncorrelated snapshots.
"""
- filename_base = \
- os.path.basename(filename_uncorrelated_snapshots).split(".")[0]
- allowed_temp_diff_K = (self.params.
- trajectory_analysis_temperature_tolerance_percent
- / 100) * self.target_calculator.temperature
+ filename_base = os.path.basename(
+ filename_uncorrelated_snapshots
+ ).split(".")[0]
+ allowed_temp_diff_K = (
+ self.params.trajectory_analysis_temperature_tolerance_percent / 100
+ ) * self.target_calculator.temperature
current_snapshot = self.first_snapshot
- begin_snapshot = self.first_snapshot+1
+ begin_snapshot = self.first_snapshot + 1
end_snapshot = len(self.trajectory)
j = 0
md_iteration = []
for i in range(begin_snapshot, end_snapshot):
- if self.__check_if_snapshot_is_valid(self.trajectory[i],
- self.temperatures[i],
- self.trajectory[current_snapshot],
- self.temperatures[current_snapshot],
- self.snapshot_correlation_cutoff,
- allowed_temp_diff_K):
+ if self.__check_if_snapshot_is_valid(
+ self.trajectory[i],
+ self.temperatures[i],
+ self.trajectory[current_snapshot],
+ self.temperatures[current_snapshot],
+ self.snapshot_correlation_cutoff,
+ allowed_temp_diff_K,
+ ):
current_snapshot = i
md_iteration.append(current_snapshot)
j += 1
np.random.shuffle(md_iteration)
for i in range(0, len(md_iteration)):
if i == 0:
- traj_writer = TrajectoryWriter(filename_base+".traj", mode='w')
+ traj_writer = TrajectoryWriter(
+ filename_base + ".traj", mode="w"
+ )
else:
- traj_writer = TrajectoryWriter(filename_base+".traj", mode='a')
- atoms_to_write = Descriptor.enforce_pbc(self.trajectory[md_iteration[i]])
+ traj_writer = TrajectoryWriter(
+ filename_base + ".traj", mode="a"
+ )
+ atoms_to_write = Descriptor.enforce_pbc(
+ self.trajectory[md_iteration[i]]
+ )
traj_writer.write(atoms=atoms_to_write)
- np.save(filename_base+"_numbers.npy", md_iteration)
+ np.save(filename_base + "_numbers.npy", md_iteration)
printout(j, "possible snapshots found in MD trajectory.")
def _analyze_distance_metric(self, trajectory):
# distance metric used for the snapshot parsing (realspace similarity
# of the snapshot), we first find the center of the equilibrated part
# of the trajectory and calculate the differences w.r.t. it.
- center = int((np.shape(self.distance_metrics_denoised)[
- 0] - self.first_snapshot) / 2) + self.first_snapshot
+ center = (
+ int(
+ (
+ np.shape(self.distance_metrics_denoised)[0]
+ - self.first_snapshot
+ )
+ / 2
+ )
+ + self.first_snapshot
+ )
width = int(
- self.params.trajectory_analysis_estimated_equilibrium *
- np.shape(self.distance_metrics_denoised)[0])
+ self.params.trajectory_analysis_estimated_equilibrium
+ * np.shape(self.distance_metrics_denoised)[0]
+ )
self.distances_realspace = []
self.__saved_rdf = None
for i in range(center - width, center + width):
self.distances_realspace.append(
self._calculate_distance_between_snapshots(
- trajectory[center], trajectory[i],
- "realspace", "minimal_distance", save_rdf1=True))
+ trajectory[center],
+ trajectory[i],
+ "realspace",
+ "minimal_distance",
+ save_rdf1=True,
+ )
+ )
# From these metrics, we assume mean - 2.576 std as limit.
# This translates to a confidence interval of ~99%, which should
# make any coincidental similarities unlikely.
cutoff = np.mean(self.distances_realspace) - 2.576 * np.std(
- self.distances_realspace)
+ self.distances_realspace
+ )
printout("Distance metric cutoff is", cutoff)
return cutoff
- def _calculate_distance_between_snapshots(self, snapshot1, snapshot2,
- distance_metric, reduction,
- save_rdf1=False):
+ def _calculate_distance_between_snapshots(
+ self,
+ snapshot1,
+ snapshot2,
+ distance_metric,
+ reduction,
+ save_rdf1=False,
+ ):
if distance_metric == "realspace":
positions1 = snapshot1.get_positions()
positions2 = snapshot2.get_positions()
if reduction == "minimal_distance":
- result = np.amin(distance.cdist(positions1, positions2),
- axis=0)
+ result = np.amin(
+ distance.cdist(positions1, positions2), axis=0
+ )
result = np.mean(result)
elif reduction == "cosine_distance":
number_of_atoms = snapshot1.get_number_of_atoms()
- result = distance.cosine(np.reshape(positions1,
- [number_of_atoms*3]),
- np.reshape(positions2,
- [number_of_atoms*3]))
+ result = distance.cosine(
+ np.reshape(positions1, [number_of_atoms * 3]),
+ np.reshape(positions2, [number_of_atoms * 3]),
+ )
else:
raise Exception("Unknown distance metric reduction.")
elif distance_metric == "rdf":
if save_rdf1 is True:
if self.__saved_rdf is None:
- self.__saved_rdf = self.target_calculator.\
- get_radial_distribution_function(snapshot1,
- method="asap3")[0]
+ self.__saved_rdf = self.target_calculator.get_radial_distribution_function(
+ snapshot1, method="asap3"
+ )[
+ 0
+ ]
rdf1 = self.__saved_rdf
else:
- rdf1 = self.target_calculator.\
- get_radial_distribution_function(snapshot1,
- method="asap3")[0]
- rdf2 = self.target_calculator.\
- get_radial_distribution_function(snapshot2,
- method="asap3")[0]
+ rdf1 = self.target_calculator.get_radial_distribution_function(
+ snapshot1, method="asap3"
+ )[0]
+ rdf2 = self.target_calculator.get_radial_distribution_function(
+ snapshot2, method="asap3"
+ )[0]
if reduction == "minimal_distance":
- raise Exception("Combination of distance metric and reduction "
- "not supported.")
+ raise Exception(
+ "Combination of distance metric and reduction "
+ "not supported."
+ )
elif reduction == "cosine_distance":
result = distance.cosine(rdf1, rdf2)
@@ -413,29 +477,34 @@ Source code for mala.datageneration.trajectory_analyzer
return result
def __denoise(self, signal):
- denoised_signal = np.convolve(signal, np.ones(
- self.params.trajectory_analysis_denoising_width)
- / self.params.
- trajectory_analysis_denoising_width,
- mode='same')
+ denoised_signal = np.convolve(
+ signal,
+ np.ones(self.params.trajectory_analysis_denoising_width)
+ / self.params.trajectory_analysis_denoising_width,
+ mode="same",
+ )
return denoised_signal
- def __check_if_snapshot_is_valid(self, snapshot_to_test, temp_to_test,
- reference_snapshot, reference_temp,
- distance_metric,
- allowed_temp_diff):
- distance = self.\
- _calculate_distance_between_snapshots(snapshot_to_test,
- reference_snapshot,
- "realspace",
- "minimal_distance")
- temp_diff = np.abs(temp_to_test-reference_temp)
+ def __check_if_snapshot_is_valid(
+ self,
+ snapshot_to_test,
+ temp_to_test,
+ reference_snapshot,
+ reference_temp,
+ distance_metric,
+ allowed_temp_diff,
+ ):
+ distance = self._calculate_distance_between_snapshots(
+ snapshot_to_test,
+ reference_snapshot,
+ "realspace",
+ "minimal_distance",
+ )
+ temp_diff = np.abs(temp_to_test - reference_temp)
if distance > distance_metric and temp_diff < allowed_temp_diff:
return True
else:
return False
-
-
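Finally, a hedged sketch of driving the TrajectoryAnalyzer above (the input files are hypothetical, and the name of the uncorrelated-snapshot method is assumed from the docstring fragments visible in the diff):

from ase.io.trajectory import Trajectory
import numpy as np

import mala

params = mala.Parameters()
trajectory = Trajectory("mala_of_dft_md.traj")
temperatures = np.load("mala_of_dft_md_temperatures.npy")  # hypothetical file

analyzer = mala.TrajectoryAnalyzer(params, trajectory, temperatures=temperatures)

# First equilibrated timestep, based on the denoised RDF distance metric.
first_snapshot = analyzer.get_first_snapshot()

# Extract decorrelated snapshots and write them to a new trajectory file.
analyzer.get_uncorrelated_snapshots("uncorrelated_snapshots.traj")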
diff --git a/_modules/mala/datahandling/data_converter.html b/_modules/mala/datahandling/data_converter.html
index 9a6c42bba..bfef50be1 100644
--- a/_modules/mala/datahandling/data_converter.html
+++ b/_modules/mala/datahandling/data_converter.html
@@ -75,6 +75,7 @@
Source code for mala.datahandling.data_converter
"""DataConverter class for converting snapshots into numpy arrays."""
+
import os
import json
@@ -85,15 +86,9 @@ Source code for mala.datahandling.data_converter
from mala.targets.target import Target
from mala.version import __version__ as mala_version
-descriptor_input_types = [
- "espresso-out"
-]
-target_input_types = [
- ".cube", ".xsf"
-]
-additional_info_input_types = [
- "espresso-out"
-]
+descriptor_input_types = ["espresso-out"]
+target_input_types = [".cube", ".xsf"]
+additional_info_input_types = ["espresso-out"]
[docs]class DataConverter:
@@ -126,8 +121,9 @@ Source code for mala.datahandling.data_converter
Target calculator used for parsing/converting target data.
"""
- def __init__(self, parameters, descriptor_calculator=None,
- target_calculator=None):
+ def __init__(
+ self, parameters, descriptor_calculator=None, target_calculator=None
+ ):
self.parameters: ParametersData = parameters.data
self.parameters_full = parameters
self.target_calculator = target_calculator
@@ -140,8 +136,9 @@ Source code for mala.datahandling.data_converter
if parameters.descriptors.use_z_splitting:
parameters.descriptors.use_z_splitting = False
- printout("Disabling z-splitting for preprocessing.",
- min_verbosity=0)
+ printout(
+ "Disabling z-splitting for preprocessing.", min_verbosity=0
+ )
self.__snapshots_to_convert = []
self.__snapshot_description = []
@@ -152,16 +149,19 @@ Source code for mala.datahandling.data_converter
self.process_targets = False
self.process_additional_info = False
-[docs] def add_snapshot(self, descriptor_input_type=None,
- descriptor_input_path=None,
- target_input_type=None,
- target_input_path=None,
- additional_info_input_type=None,
- additional_info_input_path=None,
- descriptor_units=None,
- metadata_input_type=None,
- metadata_input_path=None,
- target_units=None):
+[docs] def add_snapshot(
+ self,
+ descriptor_input_type=None,
+ descriptor_input_path=None,
+ target_input_type=None,
+ target_input_path=None,
+ additional_info_input_type=None,
+ additional_info_input_path=None,
+ descriptor_units=None,
+ metadata_input_type=None,
+ metadata_input_path=None,
+ target_units=None,
+ ):
"""
Add a snapshot to be processed.
@@ -215,17 +215,17 @@ Source code for mala.datahandling.data_converter
if descriptor_input_type is not None:
if descriptor_input_path is None:
raise Exception(
- "Cannot process descriptor data with no path "
- "given.")
+ "Cannot process descriptor data with no path given."
+ )
if descriptor_input_type not in descriptor_input_types:
- raise Exception(
- "Cannot process this type of descriptor data.")
+ raise Exception("Cannot process this type of descriptor data.")
self.process_descriptors = True
if target_input_type is not None:
if target_input_path is None:
- raise Exception("Cannot process target data with no path "
- "given.")
+ raise Exception(
+ "Cannot process target data with no path given."
+ )
if target_input_type not in target_input_types:
raise Exception("Cannot process this type of target data.")
self.process_targets = True
@@ -233,48 +233,63 @@ Source code for mala.datahandling.data_converter
if additional_info_input_type is not None:
metadata_input_type = additional_info_input_type
if additional_info_input_path is None:
- raise Exception("Cannot process additional info data with "
- "no path given.")
+ raise Exception(
+ "Cannot process additional info data with "
+ "no path given."
+ )
if additional_info_input_type not in additional_info_input_types:
raise Exception(
- "Cannot process this type of additional info "
- "data.")
+ "Cannot process this type of additional info data."
+ )
self.process_additional_info = True
metadata_input_path = additional_info_input_path
if metadata_input_type is not None:
if metadata_input_path is None:
- raise Exception("Cannot process additional info data with "
- "no path given.")
+ raise Exception(
+ "Cannot process additional info data with "
+ "no path given."
+ )
if metadata_input_type not in additional_info_input_types:
raise Exception(
- "Cannot process this type of additional info "
- "data.")
+ "Cannot process this type of additional info data."
+ )
# Assign info.
- self.__snapshots_to_convert.append({"input": descriptor_input_path,
- "output": target_input_path,
- "additional_info":
- additional_info_input_path,
- "metadata": metadata_input_path})
- self.__snapshot_description.append({"input": descriptor_input_type,
- "output": target_input_type,
- "additional_info":
- additional_info_input_type,
- "metadata": metadata_input_type})
- self.__snapshot_units.append({"input": descriptor_units,
- "output": target_units})
-
-[docs] def convert_snapshots(self, complete_save_path=None,
- descriptor_save_path=None,
- target_save_path=None,
- additional_info_save_path=None,
- naming_scheme="ELEM_snapshot*.npy", starts_at=0,
- file_based_communication=False,
- descriptor_calculation_kwargs=None,
- target_calculator_kwargs=None,
- use_fp64=False):
+ self.__snapshots_to_convert.append(
+ {
+ "input": descriptor_input_path,
+ "output": target_input_path,
+ "additional_info": additional_info_input_path,
+ "metadata": metadata_input_path,
+ }
+ )
+ self.__snapshot_description.append(
+ {
+ "input": descriptor_input_type,
+ "output": target_input_type,
+ "additional_info": additional_info_input_type,
+ "metadata": metadata_input_type,
+ }
+ )
+ self.__snapshot_units.append(
+ {"input": descriptor_units, "output": target_units}
+ )
+
+[docs] def convert_snapshots(
+ self,
+ complete_save_path=None,
+ descriptor_save_path=None,
+ target_save_path=None,
+ additional_info_save_path=None,
+ naming_scheme="ELEM_snapshot*.npy",
+ starts_at=0,
+ file_based_communication=False,
+ descriptor_calculation_kwargs=None,
+ target_calculator_kwargs=None,
+ use_fp64=False,
+ ):
"""
Convert the snapshots in the list to numpy arrays.
@@ -333,8 +348,9 @@ Source code for mala.datahandling.data_converter
import openpmd_api as io
if file_ending not in io.file_extensions:
- raise Exception("Invalid file ending selected: " +
- file_ending)
+ raise Exception(
+ "Invalid file ending selected: " + file_ending
+ )
else:
file_ending = "npy"
@@ -360,14 +376,24 @@ Source code for mala.datahandling.data_converter
additional_info_save_path = complete_save_path
else:
if self.process_targets is True and target_save_path is None:
- raise Exception("No target path specified, cannot process "
- "data.")
- if self.process_descriptors is True and descriptor_save_path is None:
- raise Exception("No descriptor path specified, cannot "
- "process data.")
- if self.process_additional_info is True and additional_info_save_path is None:
- raise Exception("No additional info path specified, cannot "
- "process data.")
+ raise Exception(
+ "No target path specified, cannot process data."
+ )
+ if (
+ self.process_descriptors is True
+ and descriptor_save_path is None
+ ):
+ raise Exception(
+ "No descriptor path specified, cannot process data."
+ )
+ if (
+ self.process_additional_info is True
+ and additional_info_save_path is None
+ ):
+ raise Exception(
+ "No additional info path specified, cannot "
+ "process data."
+ )
if file_ending != "npy":
snapshot_name = naming_scheme
@@ -376,19 +402,27 @@ Source code for mala.datahandling.data_converter
if self.process_descriptors:
if self.parameters._configuration["mpi"]:
input_series = io.Series(
- os.path.join(descriptor_save_path,
- series_name + ".in." + file_ending),
+ os.path.join(
+ descriptor_save_path,
+ series_name + ".in." + file_ending,
+ ),
io.Access.create,
get_comm(),
options=json.dumps(
- self.parameters_full.openpmd_configuration))
+ self.parameters_full.openpmd_configuration
+ ),
+ )
else:
input_series = io.Series(
- os.path.join(descriptor_save_path,
- series_name + ".in." + file_ending),
+ os.path.join(
+ descriptor_save_path,
+ series_name + ".in." + file_ending,
+ ),
io.Access.create,
options=json.dumps(
- self.parameters_full.openpmd_configuration))
+ self.parameters_full.openpmd_configuration
+ ),
+ )
input_series.set_attribute("is_mala_data", 1)
input_series.set_software(name="MALA", version="x.x.x")
input_series.author = "..."
@@ -396,19 +430,27 @@ Source code for mala.datahandling.data_converter
if self.process_targets:
if self.parameters._configuration["mpi"]:
output_series = io.Series(
- os.path.join(target_save_path,
- series_name + ".out." + file_ending),
+ os.path.join(
+ target_save_path,
+ series_name + ".out." + file_ending,
+ ),
io.Access.create,
get_comm(),
options=json.dumps(
- self.parameters_full.openpmd_configuration))
+ self.parameters_full.openpmd_configuration
+ ),
+ )
else:
output_series = io.Series(
- os.path.join(target_save_path,
- series_name + ".out." + file_ending),
+ os.path.join(
+ target_save_path,
+ series_name + ".out." + file_ending,
+ ),
io.Access.create,
options=json.dumps(
- self.parameters_full.openpmd_configuration))
+ self.parameters_full.openpmd_configuration
+ ),
+ )
output_series.set_attribute("is_mala_data", 1)
output_series.set_software(name="MALA", version=mala_version)
@@ -421,8 +463,9 @@ Source code for mala.datahandling.data_converter
# Create the paths as needed.
if self.process_additional_info:
- info_path = os.path.join(additional_info_save_path,
- snapshot_name + ".info.json")
+ info_path = os.path.join(
+ additional_info_save_path, snapshot_name + ".info.json"
+ )
else:
info_path = None
input_iteration = None
@@ -431,22 +474,27 @@ Source code for mala.datahandling.data_converter
if file_ending == "npy":
# Create the actual paths, if needed.
if self.process_descriptors:
- descriptor_path = os.path.join(descriptor_save_path,
- snapshot_name + ".in." +
- file_ending)
+ descriptor_path = os.path.join(
+ descriptor_save_path,
+ snapshot_name + ".in." + file_ending,
+ )
else:
descriptor_path = None
memmap = None
if self.process_targets:
- target_path = os.path.join(target_save_path,
- snapshot_name + ".out."+
- file_ending)
+ target_path = os.path.join(
+ target_save_path,
+ snapshot_name + ".out." + file_ending,
+ )
# A memory mapped file is used as buffer for distributed cases.
- if self.parameters._configuration["mpi"] and \
- file_based_communication:
- memmap = os.path.join(target_save_path, snapshot_name +
- ".out.npy_temp")
+ if (
+ self.parameters._configuration["mpi"]
+ and file_based_communication
+ ):
+ memmap = os.path.join(
+ target_save_path, snapshot_name + ".out.npy_temp"
+ )
else:
target_path = None
else:
@@ -454,27 +502,36 @@ Source code for mala.datahandling.data_converter
target_path = None
memmap = None
if self.process_descriptors:
- input_iteration = input_series.write_iterations()[i + starts_at]
+ input_iteration = input_series.write_iterations()[
+ i + starts_at
+ ]
input_iteration.dt = i + starts_at
input_iteration.time = 0
if self.process_targets:
- output_iteration = output_series.write_iterations()[i + starts_at]
+ output_iteration = output_series.write_iterations()[
+ i + starts_at
+ ]
output_iteration.dt = i + starts_at
output_iteration.time = 0
- self.__convert_single_snapshot(i, descriptor_calculation_kwargs,
- target_calculator_kwargs,
- input_path=descriptor_path,
- output_path=target_path,
- use_memmap=memmap,
- input_iteration=input_iteration,
- output_iteration=output_iteration,
- additional_info_path=info_path,
- use_fp64=use_fp64)
+ self.__convert_single_snapshot(
+ i,
+ descriptor_calculation_kwargs,
+ target_calculator_kwargs,
+ input_path=descriptor_path,
+ output_path=target_path,
+ use_memmap=memmap,
+ input_iteration=input_iteration,
+ output_iteration=output_iteration,
+ additional_info_path=info_path,
+ use_fp64=use_fp64,
+ )
if get_rank() == 0:
- if self.parameters._configuration["mpi"] \
- and file_based_communication:
+ if (
+ self.parameters._configuration["mpi"]
+ and file_based_communication
+ ):
os.remove(memmap)
# Properly close series
@@ -484,16 +541,19 @@ Source code for mala.datahandling.data_converter
if self.process_targets:
del output_series
- def __convert_single_snapshot(self, snapshot_number,
- descriptor_calculation_kwargs,
- target_calculator_kwargs,
- input_path=None,
- output_path=None,
- additional_info_path=None,
- use_memmap=None,
- output_iteration=None,
- input_iteration=None,
- use_fp64=False):
+ def __convert_single_snapshot(
+ self,
+ snapshot_number,
+ descriptor_calculation_kwargs,
+ target_calculator_kwargs,
+ input_path=None,
+ output_path=None,
+ additional_info_path=None,
+ use_memmap=None,
+ output_iteration=None,
+ input_iteration=None,
+ use_fp64=False,
+ ):
"""
Convert single snapshot from the conversion lists.
@@ -557,39 +617,49 @@ Source code for mala.datahandling.data_converter
descriptor_calculation_kwargs["units"] = original_units["input"]
descriptor_calculation_kwargs["use_fp64"] = use_fp64
- tmp_input, local_size = self.descriptor_calculator. \
- calculate_from_qe_out(snapshot["input"],
- **descriptor_calculation_kwargs)
+ tmp_input, local_size = (
+ self.descriptor_calculator.calculate_from_qe_out(
+ snapshot["input"], **descriptor_calculation_kwargs
+ )
+ )
elif description["input"] is None:
# In this case, only the output is processed.
pass
else:
- raise Exception("Unknown file extension, cannot convert descriptor")
+ raise Exception(
+ "Unknown file extension, cannot convert descriptor."
+ )
if description["input"] is not None:
# Save data and delete, if not requested otherwise.
if input_path is not None and input_iteration is None:
if self.parameters._configuration["mpi"]:
- tmp_input = self.descriptor_calculator. \
- gather_descriptors(tmp_input)
+ tmp_input = self.descriptor_calculator.gather_descriptors(
+ tmp_input
+ )
if get_rank() == 0:
- self.descriptor_calculator.\
- write_to_numpy_file(input_path, tmp_input)
+ self.descriptor_calculator.write_to_numpy_file(
+ input_path, tmp_input
+ )
else:
if self.parameters._configuration["mpi"]:
- tmp_input, local_offset, local_reach = \
- self.descriptor_calculator.convert_local_to_3d(tmp_input)
- self.descriptor_calculator. \
- write_to_openpmd_iteration(input_iteration,
- tmp_input,
- local_offset=local_offset,
- local_reach=local_reach)
+ tmp_input, local_offset, local_reach = (
+ self.descriptor_calculator.convert_local_to_3d(
+ tmp_input
+ )
+ )
+ self.descriptor_calculator.write_to_openpmd_iteration(
+ input_iteration,
+ tmp_input,
+ local_offset=local_offset,
+ local_reach=local_reach,
+ )
else:
- self.descriptor_calculator. \
- write_to_openpmd_iteration(input_iteration,
- tmp_input)
+ self.descriptor_calculator.write_to_openpmd_iteration(
+ input_iteration, tmp_input
+ )
del tmp_input
###########
@@ -601,25 +671,27 @@ Source code for mala.datahandling.data_converter
# Parse and/or calculate the output descriptors.
if description["output"] == ".cube":
target_calculator_kwargs["units"] = original_units[
- "output"]
+ "output"
+ ]
target_calculator_kwargs["use_memmap"] = use_memmap
target_calculator_kwargs["use_fp64"] = use_fp64
# If no units are provided we just assume standard units.
- tmp_output = self.target_calculator. \
- read_from_cube(snapshot["output"],
- **target_calculator_kwargs)
+ tmp_output = self.target_calculator.read_from_cube(
+ snapshot["output"], **target_calculator_kwargs
+ )
elif description["output"] == ".xsf":
target_calculator_kwargs["units"] = original_units[
- "output"]
+ "output"
+ ]
target_calculator_kwargs["use_memmap"] = use_memmap
target_calculator_kwargs["use_fp664"] = use_fp64
# If no units are provided we just assume standard units.
- tmp_output = self.target_calculator. \
- read_from_xsf(snapshot["output"],
- **target_calculator_kwargs)
+ tmp_output = self.target_calculator.read_from_xsf(
+ snapshot["output"], **target_calculator_kwargs
+ )
elif description["output"] is None:
# In this case, only the input is processed.
@@ -627,37 +699,39 @@ Source code for mala.datahandling.data_converter
else:
raise Exception(
- "Unknown file extension, cannot convert target"
- "data.")
+ "Unknown file extension, cannot convert target data."
+ )
if get_rank() == 0:
- self.target_calculator.write_to_numpy_file(output_path,
- tmp_output)
+ self.target_calculator.write_to_numpy_file(
+ output_path, tmp_output
+ )
else:
metadata = None
if description["metadata"] is not None:
- metadata = [snapshot["metadata"],
- description["metadata"]]
+ metadata = [snapshot["metadata"], description["metadata"]]
# Parse and/or calculate the output descriptors.
if self.parameters._configuration["mpi"]:
target_calculator_kwargs["return_local"] = True
if description["output"] == ".cube":
target_calculator_kwargs["units"] = original_units[
- "output"]
+ "output"
+ ]
target_calculator_kwargs["use_memmap"] = use_memmap
# If no units are provided we just assume standard units.
- tmp_output = self.target_calculator. \
- read_from_cube(snapshot["output"],
- **target_calculator_kwargs)
+ tmp_output = self.target_calculator.read_from_cube(
+ snapshot["output"], **target_calculator_kwargs
+ )
elif description["output"] == ".xsf":
target_calculator_kwargs["units"] = original_units[
- "output"]
+ "output"
+ ]
target_calculator_kwargs["use_memmap"] = use_memmap
# If no units are provided we just assume standard units.
- tmp_output = self.target_calculator. \
- read_from_xsf(snapshot["output"],
- **target_calculator_kwargs)
+ tmp_output = self.target_calculator.read_from_xsf(
+ snapshot["output"], **target_calculator_kwargs
+ )
elif description["output"] is None:
# In this case, only the input is processed.
@@ -665,31 +739,34 @@ Source code for mala.datahandling.data_converter
else:
raise Exception(
- "Unknown file extension, cannot convert target"
- "data.")
+ "Unknown file extension, cannot convert target data."
+ )
if self.parameters._configuration["mpi"]:
- self.target_calculator. \
- write_to_openpmd_iteration(output_iteration,
- tmp_output[0],
- feature_from=tmp_output[1],
- feature_to=tmp_output[2],
- additional_metadata=metadata)
+ self.target_calculator.write_to_openpmd_iteration(
+ output_iteration,
+ tmp_output[0],
+ feature_from=tmp_output[1],
+ feature_to=tmp_output[2],
+ additional_metadata=metadata,
+ )
else:
- self.target_calculator. \
- write_to_openpmd_iteration(output_iteration,
- tmp_output,
- additional_metadata=metadata)
+ self.target_calculator.write_to_openpmd_iteration(
+ output_iteration,
+ tmp_output,
+ additional_metadata=metadata,
+ )
del tmp_output
# Parse and/or calculate the additional info.
if description["additional_info"] is not None:
# Parsing and saving is done using the target calculator.
- self.target_calculator. \
- read_additional_calculation_data(snapshot["additional_info"],
- description["additional_info"])
- self.target_calculator. \
- write_additional_calculation_data(additional_info_path)
+ self.target_calculator.read_additional_calculation_data(
+ snapshot["additional_info"], description["additional_info"]
+ )
+ self.target_calculator.write_additional_calculation_data(
+ additional_info_path
+ )
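To put the reformatted converter methods in context, a hypothetical usage sketch follows; the keyword names mirror the variables handled by add_snapshot and the signature of convert_snapshots above, while the entry points, paths and type strings are illustrative assumptions, not values from the patch:

```python
import mala

parameters = mala.Parameters()
data_converter = mala.DataConverter(parameters)

# Hypothetical snapshot registration; type strings and paths are placeholders.
data_converter.add_snapshot(
    descriptor_input_type="espresso-out",
    descriptor_input_path="snapshot0.pw.out",
    target_input_type=".cube",
    target_input_path="snapshot0_ldos.cube",
    additional_info_input_type="espresso-out",
    additional_info_input_path="snapshot0.pw.out",
)

# Keyword names as shown in convert_snapshots() above.
data_converter.convert_snapshots(
    complete_save_path="./converted",
    naming_scheme="Be_snapshot*.npy",
    starts_at=0,
)
```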
diff --git a/_modules/mala/datahandling/data_handler.html b/_modules/mala/datahandling/data_handler.html
index 48c167b85..1c02e9c93 100644
--- a/_modules/mala/datahandling/data_handler.html
+++ b/_modules/mala/datahandling/data_handler.html
@@ -75,13 +75,9 @@
Source code for mala.datahandling.data_handler
"""DataHandler class that loads and scales data."""
+
import os
-try:
- import horovod.torch as hvd
-except ModuleNotFoundError:
- # Warning is thrown by Parameters class
- pass
import numpy as np
import torch
from torch.utils.data import TensorDataset
@@ -133,25 +129,34 @@ Source code for mala.datahandling.data_handler
# Constructors
##############################
- def __init__(self, parameters: Parameters, target_calculator=None,
- descriptor_calculator=None, input_data_scaler=None,
- output_data_scaler=None, clear_data=True):
- super(DataHandler, self).__init__(parameters,
- target_calculator=target_calculator,
- descriptor_calculator=
- descriptor_calculator)
- # Data will be scaled per user specification.
+ def __init__(
+ self,
+ parameters: Parameters,
+ target_calculator=None,
+ descriptor_calculator=None,
+ input_data_scaler=None,
+ output_data_scaler=None,
+ clear_data=True,
+ ):
+ super(DataHandler, self).__init__(
+ parameters,
+ target_calculator=target_calculator,
+ descriptor_calculator=descriptor_calculator,
+ )
+ # Data will be scaled per user specification.
self.input_data_scaler = input_data_scaler
if self.input_data_scaler is None:
- self.input_data_scaler \
- = DataScaler(self.parameters.input_rescaling_type,
- use_horovod=self.use_horovod)
+ self.input_data_scaler = DataScaler(
+ self.parameters.input_rescaling_type,
+ use_horovod=self.use_horovod,
+ )
self.output_data_scaler = output_data_scaler
if self.output_data_scaler is None:
- self.output_data_scaler \
- = DataScaler(self.parameters.output_rescaling_type,
- use_horovod=self.use_horovod)
+ self.output_data_scaler = DataScaler(
+ self.parameters.output_rescaling_type,
+ use_horovod=self.use_horovod,
+ )
# Actual data points in the different categories.
self.nr_training_data = 0
@@ -233,8 +238,10 @@ Source code for mala.datahandling.data_handler
# Do a consistency check of the snapshots so that we don't run into
# an error later. If there is an error, check_snapshots() will raise
# an exception.
- printout("Checking the snapshots and your inputs for consistency.",
- min_verbosity=1)
+ printout(
+ "Checking the snapshots and your inputs for consistency.",
+ min_verbosity=1,
+ )
self._check_snapshots()
printout("Consistency check successful.", min_verbosity=0)
@@ -243,22 +250,30 @@ Source code for mala.datahandling.data_handler
# then we can definitely not reparametrize the DataScalers.
if self.nr_training_data == 0:
reparametrize_scaler = False
- if self.input_data_scaler.cantransform is False or \
- self.output_data_scaler.cantransform is False:
- raise Exception("In inference mode, the DataHandler needs "
- "parametrized DataScalers, "
- "while you provided unparametrized "
- "DataScalers.")
+ if (
+ self.input_data_scaler.cantransform is False
+ or self.output_data_scaler.cantransform is False
+ ):
+ raise Exception(
+ "In inference mode, the DataHandler needs "
+ "parametrized DataScalers, "
+ "while you provided unparametrized "
+ "DataScalers."
+ )
# Parametrize the scalers, if needed.
if reparametrize_scaler:
printout("Initializing the data scalers.", min_verbosity=1)
self.__parametrize_scalers()
printout("Data scalers initialized.", min_verbosity=0)
- elif self.parameters.use_lazy_loading is False and \
- self.nr_training_data != 0:
- printout("Data scalers already initilized, loading data to RAM.",
- min_verbosity=0)
+ elif (
+ self.parameters.use_lazy_loading is False
+ and self.nr_training_data != 0
+ ):
+ printout(
+ "Data scalers already initilized, loading data to RAM.",
+ min_verbosity=0,
+ )
self.__load_data("training", "inputs")
self.__load_data("training", "outputs")
@@ -325,17 +340,21 @@ Source code for mala.datahandling.data_handler
"""
# get the snapshot from the snapshot number
snapshot = self.parameters.snapshot_directories_list[snapshot_number]
-
+
if self.parameters.use_lazy_loading:
# This fails if an incorrect snapshot was loaded.
if self.test_data_sets[0].currently_loaded_file != snapshot_number:
- raise Exception("Cannot calculate gradients, wrong file "
- "was lazily loaded.")
+ raise Exception(
+ "Cannot calculate gradients, wrong file "
+ "was lazily loaded."
+ )
return self.test_data_sets[0].input_data.grad
else:
- return self.test_data_inputs.\
- grad[snapshot.grid_size*snapshot_number:
- snapshot.grid_size*(snapshot_number+1)]
+ return self.test_data_inputs.grad[
+ snapshot.grid_size
+ * snapshot_number : snapshot.grid_size
+ * (snapshot_number + 1)
+ ]
[docs] def get_snapshot_calculation_output(self, snapshot_number):
"""
@@ -352,14 +371,16 @@ Source code for mala.datahandling.data_handler
Path to the calculation output for this snapshot.
"""
- return self.parameters.snapshot_directories_list[snapshot_number].\
- calculation_output
+ return self.parameters.snapshot_directories_list[
+ snapshot_number
+ ].calculation_output
# Debugging
######################
-
-[docs] def raw_numpy_to_converted_scaled_tensor(self, numpy_array, data_type,
- units, convert3Dto1D=False):
+
+[docs] def raw_numpy_to_converted_scaled_tensor(
+ self, numpy_array, data_type, units, convert3Dto1D=False
+ ):
"""
Transform a raw numpy array into a scaled torch tensor.
@@ -386,12 +407,14 @@ Source code for mala.datahandling.data_handler
"""
# Check parameters for consistency.
if data_type != "in" and data_type != "out":
- raise Exception("Please specify either \"in\" or \"out\" as "
- "data_type.")
+ raise Exception(
+ 'Please specify either "in" or "out" as ' "data_type."
+ )
# Convert units of numpy array.
- numpy_array = self.__raw_numpy_to_converted_numpy(numpy_array,
- data_type, units)
+ numpy_array = self.__raw_numpy_to_converted_numpy(
+ numpy_array, data_type, units
+ )
# If desired, the dimensions can be changed.
if convert3Dto1D:
@@ -405,16 +428,17 @@ Source code for mala.datahandling.data_handler
desired_dimensions = None
# Convert numpy array to scaled tensor a network can work with.
- numpy_array = self.\
- __converted_numpy_to_scaled_tensor(numpy_array, desired_dimensions,
- data_type)
+ numpy_array = self.__converted_numpy_to_scaled_tensor(
+ numpy_array, desired_dimensions, data_type
+ )
return numpy_array
-[docs] def resize_snapshots_for_debugging(self, directory="./",
- naming_scheme_input=
- "test_Al_debug_2k_nr*.in",
- naming_scheme_output=
- "test_Al_debug_2k_nr*.out"):
+[docs] def resize_snapshots_for_debugging(
+ self,
+ directory="./",
+ naming_scheme_input="test_Al_debug_2k_nr*.in",
+ naming_scheme_output="test_Al_debug_2k_nr*.out",
+ ):
"""
Resize all snapshots in the list.
@@ -433,18 +457,22 @@ Source code for mala.datahandling.data_handler
i = 0
snapshot: Snapshot
for snapshot in self.parameters.snapshot_directories_list:
- tmp_array = self.descriptor_calculator.\
- read_from_numpy_file(os.path.join(snapshot.input_npy_directory,
- snapshot.input_npy_file),
- units=snapshot.input_units)
+ tmp_array = self.descriptor_calculator.read_from_numpy_file(
+ os.path.join(
+ snapshot.input_npy_directory, snapshot.input_npy_file
+ ),
+ units=snapshot.input_units,
+ )
tmp_file_name = naming_scheme_input
tmp_file_name = tmp_file_name.replace("*", str(i))
np.save(os.path.join(directory, tmp_file_name) + ".npy", tmp_array)
- tmp_array = self.target_calculator.\
- read_from_numpy_file(os.path.join(snapshot.output_npy_directory,
- snapshot.output_npy_file),
- units=snapshot.output_units)
+ tmp_array = self.target_calculator.read_from_numpy_file(
+ os.path.join(
+ snapshot.output_npy_directory, snapshot.output_npy_file
+ ),
+ units=snapshot.output_units,
+ )
tmp_file_name = naming_scheme_output
tmp_file_name = tmp_file_name.replace("*", str(i))
np.save(os.path.join(directory, tmp_file_name + ".npy"), tmp_array)
@@ -478,29 +506,36 @@ Source code for mala.datahandling.data_handler
self.nr_validation_snapshots += 1
self.nr_validation_data += snapshot.grid_size
else:
- raise Exception("Unknown option for snapshot splitting "
- "selected.")
+ raise Exception(
+ "Unknown option for snapshot splitting selected."
+ )
# Now we need to check whether or not this input is believable.
nr_of_snapshots = len(self.parameters.snapshot_directories_list)
- if nr_of_snapshots != (self.nr_training_snapshots +
- self.nr_test_snapshots +
- self.nr_validation_snapshots):
- raise Exception("Cannot split snapshots with specified "
- "splitting scheme, "
- "too few or too many options selected")
+ if nr_of_snapshots != (
+ self.nr_training_snapshots
+ + self.nr_test_snapshots
+ + self.nr_validation_snapshots
+ ):
+ raise Exception(
+ "Cannot split snapshots with specified "
+ "splitting scheme, "
+ "too few or too many options selected"
+ )
# MALA can either be run in training or test-only mode.
# But it has to be run in either of those!
# So either training AND validation snapshots can be provided
# OR only test snapshots.
if self.nr_test_snapshots != 0:
if self.nr_training_snapshots == 0:
- printout("DataHandler prepared for inference. No training "
- "possible with this setup. If this is not what "
- "you wanted, please revise the input script. "
- "Validation snapshots you may have entered will"
- "be ignored.",
- min_verbosity=0)
+ printout(
+ "DataHandler prepared for inference. No training "
+ "possible with this setup. If this is not what "
+ "you wanted, please revise the input script. "
+ "Validation snapshots you may have entered will"
+ "be ignored.",
+ min_verbosity=0,
+ )
else:
if self.nr_training_snapshots == 0:
raise Exception("No training snapshots provided.")
@@ -510,38 +545,44 @@ Source code for mala.datahandling.data_handler
raise Exception("Wrong parameter for data splitting provided.")
if not self.parameters.use_lazy_loading:
- self.__allocate_arrays()
+ self.__allocate_arrays()
# Reordering the lists.
- snapshot_order = {'tr': 0, 'va': 1, 'te': 2}
- self.parameters.snapshot_directories_list.sort(key=lambda d:
- snapshot_order
- [d.snapshot_function])
+ snapshot_order = {"tr": 0, "va": 1, "te": 2}
+ self.parameters.snapshot_directories_list.sort(
+ key=lambda d: snapshot_order[d.snapshot_function]
+ )
def __allocate_arrays(self):
if self.nr_training_data > 0:
- self.training_data_inputs = np.zeros((self.nr_training_data,
- self.input_dimension),
- dtype=DEFAULT_NP_DATA_DTYPE)
- self.training_data_outputs = np.zeros((self.nr_training_data,
- self.output_dimension),
- dtype=DEFAULT_NP_DATA_DTYPE)
+ self.training_data_inputs = np.zeros(
+ (self.nr_training_data, self.input_dimension),
+ dtype=DEFAULT_NP_DATA_DTYPE,
+ )
+ self.training_data_outputs = np.zeros(
+ (self.nr_training_data, self.output_dimension),
+ dtype=DEFAULT_NP_DATA_DTYPE,
+ )
if self.nr_validation_data > 0:
- self.validation_data_inputs = np.zeros((self.nr_validation_data,
- self.input_dimension),
- dtype=DEFAULT_NP_DATA_DTYPE)
- self.validation_data_outputs = np.zeros((self.nr_validation_data,
- self.output_dimension),
- dtype=DEFAULT_NP_DATA_DTYPE)
+ self.validation_data_inputs = np.zeros(
+ (self.nr_validation_data, self.input_dimension),
+ dtype=DEFAULT_NP_DATA_DTYPE,
+ )
+ self.validation_data_outputs = np.zeros(
+ (self.nr_validation_data, self.output_dimension),
+ dtype=DEFAULT_NP_DATA_DTYPE,
+ )
if self.nr_test_data > 0:
- self.test_data_inputs = np.zeros((self.nr_test_data,
- self.input_dimension),
- dtype=DEFAULT_NP_DATA_DTYPE)
- self.test_data_outputs = np.zeros((self.nr_test_data,
- self.output_dimension),
- dtype=DEFAULT_NP_DATA_DTYPE)
+ self.test_data_inputs = np.zeros(
+ (self.nr_test_data, self.input_dimension),
+ dtype=DEFAULT_NP_DATA_DTYPE,
+ )
+ self.test_data_outputs = np.zeros(
+ (self.nr_test_data, self.output_dimension),
+ dtype=DEFAULT_NP_DATA_DTYPE,
+ )
def __load_data(self, function, data_type):
"""
@@ -556,21 +597,27 @@ Source code for mala.datahandling.data_handler
data_type : string
Can be "input" or "output".
"""
- if function != "training" and function != "test" and \
- function != "validation":
+ if (
+ function != "training"
+ and function != "test"
+ and function != "validation"
+ ):
raise Exception("Unknown snapshot type detected.")
if data_type != "outputs" and data_type != "inputs":
raise Exception("Unknown data type detected.")
# Extracting all the information pertaining to the data set.
- array = function+"_data_"+data_type
+ array = function + "_data_" + data_type
if data_type == "inputs":
calculator = self.descriptor_calculator
else:
calculator = self.target_calculator
- feature_dimension = self.input_dimension if data_type == "inputs" \
+ feature_dimension = (
+ self.input_dimension
+ if data_type == "inputs"
else self.output_dimension
+ )
snapshot_counter = 0
gs_old = 0
@@ -581,25 +628,32 @@ Source code for mala.datahandling.data_handler
# Data scaling is only performed on the training data sets.
if snapshot.snapshot_function == function[0:2]:
if data_type == "inputs":
- file = os.path.join(snapshot.input_npy_directory,
- snapshot.input_npy_file)
+ file = os.path.join(
+ snapshot.input_npy_directory, snapshot.input_npy_file
+ )
units = snapshot.input_units
else:
- file = os.path.join(snapshot.output_npy_directory,
- snapshot.output_npy_file)
+ file = os.path.join(
+ snapshot.output_npy_directory,
+ snapshot.output_npy_file,
+ )
units = snapshot.output_units
if snapshot.snapshot_type == "numpy":
calculator.read_from_numpy_file(
file,
units=units,
- array=getattr(self, array)[gs_old : gs_old + gs_new, :],
+ array=getattr(self, array)[
+ gs_old : gs_old + gs_new, :
+ ],
reshape=True,
)
elif snapshot.snapshot_type == "openpmd":
- getattr(self, array)[gs_old : gs_old + gs_new] = \
- calculator.read_from_openpmd_file(file, units=units) \
- .reshape([gs_new, feature_dimension])
+ getattr(self, array)[gs_old : gs_old + gs_new] = (
+ calculator.read_from_openpmd_file(
+ file, units=units
+ ).reshape([gs_new, feature_dimension])
+ )
else:
raise Exception("Unknown snapshot file type.")
snapshot_counter += 1
@@ -615,61 +669,91 @@ Source code for mala.datahandling.data_handler
# all ears.
if data_type == "inputs":
if function == "training":
- self.training_data_inputs = torch.\
- from_numpy(self.training_data_inputs).float()
+ self.training_data_inputs = torch.from_numpy(
+ self.training_data_inputs
+ ).float()
if function == "validation":
- self.validation_data_inputs = torch.\
- from_numpy(self.validation_data_inputs).float()
+ self.validation_data_inputs = torch.from_numpy(
+ self.validation_data_inputs
+ ).float()
if function == "test":
- self.test_data_inputs = torch.\
- from_numpy(self.test_data_inputs).float()
+ self.test_data_inputs = torch.from_numpy(
+ self.test_data_inputs
+ ).float()
if data_type == "outputs":
if function == "training":
- self.training_data_outputs = torch.\
- from_numpy(self.training_data_outputs).float()
+ self.training_data_outputs = torch.from_numpy(
+ self.training_data_outputs
+ ).float()
if function == "validation":
- self.validation_data_outputs = torch.\
- from_numpy(self.validation_data_outputs).float()
+ self.validation_data_outputs = torch.from_numpy(
+ self.validation_data_outputs
+ ).float()
if function == "test":
- self.test_data_outputs = torch.\
- from_numpy(self.test_data_outputs).float()
-
+ self.test_data_outputs = torch.from_numpy(
+ self.test_data_outputs
+ ).float()
+
def __build_datasets(self):
"""Build the DataSets that are used during training."""
- if self.parameters.use_lazy_loading and not self.parameters.use_lazy_loading_prefetch:
+ if (
+ self.parameters.use_lazy_loading
+ and not self.parameters.use_lazy_loading_prefetch
+ ):
# Create the lazy loading data sets.
- self.training_data_sets.append(LazyLoadDataset(
- self.input_dimension, self.output_dimension,
- self.input_data_scaler, self.output_data_scaler,
- self.descriptor_calculator, self.target_calculator,
- self.use_horovod))
- self.validation_data_sets.append(LazyLoadDataset(
- self.input_dimension, self.output_dimension,
- self.input_data_scaler, self.output_data_scaler,
- self.descriptor_calculator, self.target_calculator,
- self.use_horovod))
-
- if self.nr_test_data != 0:
- self.test_data_sets.append(LazyLoadDataset(
+ self.training_data_sets.append(
+ LazyLoadDataset(
+ self.input_dimension,
+ self.output_dimension,
+ self.input_data_scaler,
+ self.output_data_scaler,
+ self.descriptor_calculator,
+ self.target_calculator,
+ self.use_horovod,
+ )
+ )
+ self.validation_data_sets.append(
+ LazyLoadDataset(
self.input_dimension,
self.output_dimension,
- self.input_data_scaler, self.output_data_scaler,
- self.descriptor_calculator, self.target_calculator,
+ self.input_data_scaler,
+ self.output_data_scaler,
+ self.descriptor_calculator,
+ self.target_calculator,
self.use_horovod,
- input_requires_grad=True))
+ )
+ )
+
+ if self.nr_test_data != 0:
+ self.test_data_sets.append(
+ LazyLoadDataset(
+ self.input_dimension,
+ self.output_dimension,
+ self.input_data_scaler,
+ self.output_data_scaler,
+ self.descriptor_calculator,
+ self.target_calculator,
+ self.use_horovod,
+ input_requires_grad=True,
+ )
+ )
# Add snapshots to the lazy loading data sets.
for snapshot in self.parameters.snapshot_directories_list:
if snapshot.snapshot_function == "tr":
- self.training_data_sets[0].add_snapshot_to_dataset(snapshot)
+ self.training_data_sets[0].add_snapshot_to_dataset(
+ snapshot
+ )
if snapshot.snapshot_function == "va":
- self.validation_data_sets[0].add_snapshot_to_dataset(snapshot)
+ self.validation_data_sets[0].add_snapshot_to_dataset(
+ snapshot
+ )
if snapshot.snapshot_function == "te":
self.test_data_sets[0].add_snapshot_to_dataset(snapshot)
@@ -679,33 +763,57 @@ Source code for mala.datahandling.data_handler
# self.training_data_set.mix_datasets()
# self.validation_data_set.mix_datasets()
# self.test_data_set.mix_datasets()
- elif self.parameters.use_lazy_loading and self.parameters.use_lazy_loading_prefetch:
+ elif (
+ self.parameters.use_lazy_loading
+ and self.parameters.use_lazy_loading_prefetch
+ ):
printout("Using lazy loading pre-fetching.", min_verbosity=2)
# Create LazyLoadDatasetSingle instances per snapshot and add to
# list.
for snapshot in self.parameters.snapshot_directories_list:
if snapshot.snapshot_function == "tr":
- self.training_data_sets.append(LazyLoadDatasetSingle(
- self.mini_batch_size, snapshot,
- self.input_dimension, self.output_dimension,
- self.input_data_scaler, self.output_data_scaler,
- self.descriptor_calculator, self.target_calculator,
- self.use_horovod))
+ self.training_data_sets.append(
+ LazyLoadDatasetSingle(
+ self.mini_batch_size,
+ snapshot,
+ self.input_dimension,
+ self.output_dimension,
+ self.input_data_scaler,
+ self.output_data_scaler,
+ self.descriptor_calculator,
+ self.target_calculator,
+ self.use_horovod,
+ )
+ )
if snapshot.snapshot_function == "va":
- self.validation_data_sets.append(LazyLoadDatasetSingle(
- self.mini_batch_size, snapshot,
- self.input_dimension, self.output_dimension,
- self.input_data_scaler, self.output_data_scaler,
- self.descriptor_calculator, self.target_calculator,
- self.use_horovod))
+ self.validation_data_sets.append(
+ LazyLoadDatasetSingle(
+ self.mini_batch_size,
+ snapshot,
+ self.input_dimension,
+ self.output_dimension,
+ self.input_data_scaler,
+ self.output_data_scaler,
+ self.descriptor_calculator,
+ self.target_calculator,
+ self.use_horovod,
+ )
+ )
if snapshot.snapshot_function == "te":
- self.test_data_sets.append(LazyLoadDatasetSingle(
- self.mini_batch_size, snapshot,
- self.input_dimension, self.output_dimension,
- self.input_data_scaler, self.output_data_scaler,
- self.descriptor_calculator, self.target_calculator,
- self.use_horovod,
- input_requires_grad=True))
+ self.test_data_sets.append(
+ LazyLoadDatasetSingle(
+ self.mini_batch_size,
+ snapshot,
+ self.input_dimension,
+ self.output_dimension,
+ self.input_data_scaler,
+ self.output_data_scaler,
+ self.descriptor_calculator,
+ self.target_calculator,
+ self.use_horovod,
+ input_requires_grad=True,
+ )
+ )
else:
if self.nr_training_data != 0:
@@ -713,14 +821,20 @@ Source code for mala.datahandling.data_handler
self.output_data_scaler.transform(self.training_data_outputs)
if self.parameters.use_fast_tensor_data_set:
printout("Using FastTensorDataset.", min_verbosity=2)
- self.training_data_sets.append( \
- FastTensorDataset(self.mini_batch_size,
- self.training_data_inputs,
- self.training_data_outputs))
+ self.training_data_sets.append(
+ FastTensorDataset(
+ self.mini_batch_size,
+ self.training_data_inputs,
+ self.training_data_outputs,
+ )
+ )
else:
- self.training_data_sets.append( \
- TensorDataset(self.training_data_inputs,
- self.training_data_outputs))
+ self.training_data_sets.append(
+ TensorDataset(
+ self.training_data_inputs,
+ self.training_data_outputs,
+ )
+ )
if self.nr_validation_data != 0:
self.__load_data("validation", "inputs")
@@ -730,14 +844,20 @@ Source code for mala.datahandling.data_handler
self.output_data_scaler.transform(self.validation_data_outputs)
if self.parameters.use_fast_tensor_data_set:
printout("Using FastTensorDataset.", min_verbosity=2)
- self.validation_data_sets.append( \
- FastTensorDataset(self.mini_batch_size,
- self.validation_data_inputs,
- self.validation_data_outputs))
+ self.validation_data_sets.append(
+ FastTensorDataset(
+ self.mini_batch_size,
+ self.validation_data_inputs,
+ self.validation_data_outputs,
+ )
+ )
else:
- self.validation_data_sets.append( \
- TensorDataset(self.validation_data_inputs,
- self.validation_data_outputs))
+ self.validation_data_sets.append(
+ TensorDataset(
+ self.validation_data_inputs,
+ self.validation_data_outputs,
+ )
+ )
if self.nr_test_data != 0:
self.__load_data("test", "inputs")
@@ -746,9 +866,11 @@ Source code for mala.datahandling.data_handler
self.__load_data("test", "outputs")
self.output_data_scaler.transform(self.test_data_outputs)
- self.test_data_sets.append( \
- TensorDataset(self.test_data_inputs,
- self.test_data_outputs))
+ self.test_data_sets.append(
+ TensorDataset(
+ self.test_data_inputs, self.test_data_outputs
+ )
+ )
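When lazy loading is disabled, the branch reformatted above boils down to wrapping the preloaded tensors in dataset objects. A minimal sketch with toy tensors, using plain torch.utils.data classes (MALA's FastTensorDataset is left out since its interface is not shown in this patch):

```python
import torch
from torch.utils.data import TensorDataset, DataLoader

# Toy stand-ins for training_data_inputs / training_data_outputs.
inputs = torch.rand(100, 4)
outputs = torch.rand(100, 2)

training_set = TensorDataset(inputs, outputs)
loader = DataLoader(training_set, batch_size=10, shuffle=True)
for batch_inputs, batch_outputs in loader:
    pass  # batches of 10 paired input/output rows
```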
# Scaling
######################
@@ -773,14 +895,22 @@ Source code for mala.datahandling.data_handler
# Data scaling is only performed on the training data sets.
if snapshot.snapshot_function == "tr":
if snapshot.snapshot_type == "numpy":
- tmp = self.descriptor_calculator. \
- read_from_numpy_file(os.path.join(snapshot.input_npy_directory,
- snapshot.input_npy_file),
- units=snapshot.input_units)
+ tmp = self.descriptor_calculator.read_from_numpy_file(
+ os.path.join(
+ snapshot.input_npy_directory,
+ snapshot.input_npy_file,
+ ),
+ units=snapshot.input_units,
+ )
elif snapshot.snapshot_type == "openpmd":
- tmp = self.descriptor_calculator. \
- read_from_openpmd_file(os.path.join(snapshot.input_npy_directory,
- snapshot.input_npy_file))
+ tmp = (
+ self.descriptor_calculator.read_from_openpmd_file(
+ os.path.join(
+ snapshot.input_npy_directory,
+ snapshot.input_npy_file,
+ )
+ )
+ )
else:
raise Exception("Unknown snapshot file type.")
@@ -792,8 +922,9 @@ Source code for mala.datahandling.data_handler
tmp = np.array(tmp)
if tmp.dtype != DEFAULT_NP_DATA_DTYPE:
tmp = tmp.astype(DEFAULT_NP_DATA_DTYPE)
- tmp = tmp.reshape([snapshot.grid_size,
- self.input_dimension])
+ tmp = tmp.reshape(
+ [snapshot.grid_size, self.input_dimension]
+ )
tmp = torch.from_numpy(tmp).float()
self.input_data_scaler.incremental_fit(tmp)
@@ -825,14 +956,20 @@ Source code for mala.datahandling.data_handler
# Data scaling is only performed on the training data sets.
if snapshot.snapshot_function == "tr":
if snapshot.snapshot_type == "numpy":
- tmp = self.target_calculator.\
- read_from_numpy_file(os.path.join(snapshot.output_npy_directory,
- snapshot.output_npy_file),
- units=snapshot.output_units)
+ tmp = self.target_calculator.read_from_numpy_file(
+ os.path.join(
+ snapshot.output_npy_directory,
+ snapshot.output_npy_file,
+ ),
+ units=snapshot.output_units,
+ )
elif snapshot.snapshot_type == "openpmd":
- tmp = self.target_calculator. \
- read_from_openpmd_file(os.path.join(snapshot.output_npy_directory,
- snapshot.output_npy_file))
+ tmp = self.target_calculator.read_from_openpmd_file(
+ os.path.join(
+ snapshot.output_npy_directory,
+ snapshot.output_npy_file,
+ )
+ )
else:
raise Exception("Unknown snapshot file type.")
@@ -844,8 +981,9 @@ Source code for mala.datahandling.data_handler
tmp = np.array(tmp)
if tmp.dtype != DEFAULT_NP_DATA_DTYPE:
tmp = tmp.astype(DEFAULT_NP_DATA_DTYPE)
- tmp = tmp.reshape([snapshot.grid_size,
- self.output_dimension])
+ tmp = tmp.reshape(
+ [snapshot.grid_size, self.output_dimension]
+ )
tmp = torch.from_numpy(tmp).float()
self.output_data_scaler.incremental_fit(tmp)
i += 1
@@ -855,30 +993,35 @@ Source code for mala.datahandling.data_handler
self.__load_data("training", "outputs")
self.output_data_scaler.fit(self.training_data_outputs)
- printout("Output scaler parametrized.", min_verbosity=1)
+ printout("Output scaler parametrized.", min_verbosity=1)
- def __raw_numpy_to_converted_numpy(self, numpy_array, data_type="in",
- units=None):
+ def __raw_numpy_to_converted_numpy(
+ self, numpy_array, data_type="in", units=None
+ ):
"""Convert a raw numpy array containing into the correct units."""
if data_type == "in":
- if data_type == "in" and self.descriptor_calculator.\
- descriptors_contain_xyz:
+ if (
+ data_type == "in"
+ and self.descriptor_calculator.descriptors_contain_xyz
+ ):
numpy_array = numpy_array[:, :, :, 3:]
if units is not None:
- numpy_array *= self.descriptor_calculator.convert_units(1,
- units)
+ numpy_array *= self.descriptor_calculator.convert_units(
+ 1, units
+ )
return numpy_array
elif data_type == "out":
if units is not None:
numpy_array *= self.target_calculator.convert_units(1, units)
return numpy_array
else:
- raise Exception("Please choose either \"in\" or \"out\" for "
- "this function.")
+ raise Exception(
+ 'Please choose either "in" or "out" for ' "this function."
+ )
- def __converted_numpy_to_scaled_tensor(self, numpy_array,
- desired_dimensions=None,
- data_type="in"):
+ def __converted_numpy_to_scaled_tensor(
+ self, numpy_array, desired_dimensions=None, data_type="in"
+ ):
"""
Transform a numpy array into a scaled torch tensor.
@@ -894,8 +1037,9 @@ Source code for mala.datahandling.data_handler
elif data_type == "out":
self.output_data_scaler.transform(numpy_array)
else:
- raise Exception("Please choose either \"in\" or \"out\" for "
- "this function.")
+ raise Exception(
+ 'Please choose either "in" or "out" for ' "this function."
+ )
return numpy_array
diff --git a/_modules/mala/datahandling/data_handler_base.html b/_modules/mala/datahandling/data_handler_base.html
index 26c8ff0cd..78641f5c2 100644
--- a/_modules/mala/datahandling/data_handler_base.html
+++ b/_modules/mala/datahandling/data_handler_base.html
@@ -75,6 +75,7 @@
Source code for mala.datahandling.data_handler_base
"""Base class for all data handling (loading, shuffling, etc.)."""
+
from abc import ABC
import os
@@ -105,8 +106,12 @@ Source code for mala.datahandling.data_handler_base
be created by this class.
"""
- def __init__(self, parameters: Parameters, target_calculator=None,
- descriptor_calculator=None):
+ def __init__(
+ self,
+ parameters: Parameters,
+ target_calculator=None,
+ descriptor_calculator=None,
+ ):
self.parameters: ParametersData = parameters.data
self.use_horovod = parameters.use_horovod
@@ -152,11 +157,18 @@ Source code for mala.datahandling.data_handler_base
# Adding/Deleting data
########################
-[docs] def add_snapshot(self, input_file, input_directory,
- output_file, output_directory,
- add_snapshot_as,
- output_units="1/(eV*A^3)", input_units="None",
- calculation_output_file="", snapshot_type="numpy"):
+[docs] def add_snapshot(
+ self,
+ input_file,
+ input_directory,
+ output_file,
+ output_directory,
+ add_snapshot_as,
+ output_units="1/(eV*A^3)",
+ input_units="None",
+ calculation_output_file="",
+ snapshot_type="numpy",
+ ):
"""
Add a snapshot to the data pipeline.
@@ -195,13 +207,17 @@ Source code for mala.datahandling.data_handler_base
Either "numpy" or "openpmd" based on what kind of files you
want to operate on.
"""
- snapshot = Snapshot(input_file, input_directory,
- output_file, output_directory,
- add_snapshot_as,
- input_units=input_units,
- output_units=output_units,
- calculation_output=calculation_output_file,
- snapshot_type=snapshot_type)
+ snapshot = Snapshot(
+ input_file,
+ input_directory,
+ output_file,
+ output_directory,
+ add_snapshot_as,
+ input_units=input_units,
+ output_units=output_units,
+ calculation_output=calculation_output_file,
+ snapshot_type=snapshot_type,
+ )
self.parameters.snapshot_directories_list.append(snapshot)
[docs] def clear_data(self):
@@ -230,18 +246,29 @@ Source code for mala.datahandling.data_handler_base
# Descriptors.
####################
- printout("Checking descriptor file ", snapshot.input_npy_file,
- "at", snapshot.input_npy_directory, min_verbosity=1)
+ printout(
+ "Checking descriptor file ",
+ snapshot.input_npy_file,
+ "at",
+ snapshot.input_npy_directory,
+ min_verbosity=1,
+ )
if snapshot.snapshot_type == "numpy":
- tmp_dimension = self.descriptor_calculator. \
- read_dimensions_from_numpy_file(
- os.path.join(snapshot.input_npy_directory,
- snapshot.input_npy_file))
+ tmp_dimension = (
+ self.descriptor_calculator.read_dimensions_from_numpy_file(
+ os.path.join(
+ snapshot.input_npy_directory,
+ snapshot.input_npy_file,
+ )
+ )
+ )
elif snapshot.snapshot_type == "openpmd":
- tmp_dimension = self.descriptor_calculator. \
- read_dimensions_from_openpmd_file(
- os.path.join(snapshot.input_npy_directory,
- snapshot.input_npy_file), comm=comm)
+ tmp_dimension = self.descriptor_calculator.read_dimensions_from_openpmd_file(
+ os.path.join(
+ snapshot.input_npy_directory, snapshot.input_npy_file
+ ),
+ comm=comm,
+ )
else:
raise Exception("Unknown snapshot file type.")
@@ -255,24 +282,40 @@ Source code for mala.datahandling.data_handler_base
self.input_dimension = tmp_input_dimension
else:
if self.input_dimension != tmp_input_dimension:
- raise Exception("Invalid snapshot entered at ", snapshot.
- input_npy_file)
+ raise Exception(
+ "Invalid snapshot entered at ",
+ snapshot.input_npy_file,
+ )
####################
# Targets.
####################
- printout("Checking targets file ", snapshot.output_npy_file, "at",
- snapshot.output_npy_directory, min_verbosity=1)
+ printout(
+ "Checking targets file ",
+ snapshot.output_npy_file,
+ "at",
+ snapshot.output_npy_directory,
+ min_verbosity=1,
+ )
if snapshot.snapshot_type == "numpy":
- tmp_dimension = self.target_calculator. \
- read_dimensions_from_numpy_file(
- os.path.join(snapshot.output_npy_directory,
- snapshot.output_npy_file))
+ tmp_dimension = (
+ self.target_calculator.read_dimensions_from_numpy_file(
+ os.path.join(
+ snapshot.output_npy_directory,
+ snapshot.output_npy_file,
+ )
+ )
+ )
elif snapshot.snapshot_type == "openpmd":
- tmp_dimension = self.target_calculator. \
- read_dimensions_from_openpmd_file(
- os.path.join(snapshot.output_npy_directory,
- snapshot.output_npy_file), comm=comm)
+ tmp_dimension = (
+ self.target_calculator.read_dimensions_from_openpmd_file(
+ os.path.join(
+ snapshot.output_npy_directory,
+ snapshot.output_npy_file,
+ ),
+ comm=comm,
+ )
+ )
else:
raise Exception("Unknown snapshot file type.")
@@ -283,8 +326,10 @@ Source code for mala.datahandling.data_handler_base
self.output_dimension = tmp_output_dimension
else:
if self.output_dimension != tmp_output_dimension:
- raise Exception("Invalid snapshot entered at ", snapshot.
- output_npy_file)
+ raise Exception(
+ "Invalid snapshot entered at ",
+ snapshot.output_npy_file,
+ )
if np.prod(tmp_dimension[0:3]) != snapshot.grid_size:
raise Exception("Inconsistent snapshot data provided.")
diff --git a/_modules/mala/datahandling/data_scaler.html b/_modules/mala/datahandling/data_scaler.html
index c5b88eb2b..424520e4c 100644
--- a/_modules/mala/datahandling/data_scaler.html
+++ b/_modules/mala/datahandling/data_scaler.html
@@ -75,6 +75,7 @@
Source code for mala.datahandling.data_scaler
"""DataScaler class for scaling DFT data."""
+
import pickle
try:
@@ -129,8 +130,8 @@ Source code for mala.datahandling.data_scaler
self.mins = torch.empty(0)
self.total_mean = torch.tensor(0)
self.total_std = torch.tensor(0)
- self.total_max = torch.tensor(float('-inf'))
- self.total_min = torch.tensor(float('inf'))
+ self.total_max = torch.tensor(float("-inf"))
+ self.total_min = torch.tensor(float("inf"))
self.total_data_count = 0
@@ -193,24 +194,29 @@ Source code for mala.datahandling.data_scaler
old_std = self.stds
if list(self.means.size())[0] > 0:
- self.means = \
- self.total_data_count /\
- (self.total_data_count + current_data_count) \
- * old_mean + current_data_count / \
- (self.total_data_count + current_data_count)\
+ self.means = (
+ self.total_data_count
+ / (self.total_data_count + current_data_count)
+ * old_mean
+ + current_data_count
+ / (self.total_data_count + current_data_count)
* new_mean
+ )
else:
self.means = new_mean
if list(self.stds.size())[0] > 0:
- self.stds = \
- self.total_data_count / \
- (self.total_data_count + current_data_count) \
- * old_std ** 2 + current_data_count / \
- (self.total_data_count + current_data_count) *\
- new_std ** 2 + \
- (self.total_data_count * current_data_count)\
- / (self.total_data_count + current_data_count)\
- ** 2 * (old_mean - new_mean) ** 2
+ self.stds = (
+ self.total_data_count
+ / (self.total_data_count + current_data_count)
+ * old_std**2
+ + current_data_count
+ / (self.total_data_count + current_data_count)
+ * new_std**2
+ + (self.total_data_count * current_data_count)
+ / (self.total_data_count + current_data_count)
+ ** 2
+ * (old_mean - new_mean) ** 2
+ )
self.stds = torch.sqrt(self.stds)
else:
@@ -241,8 +247,9 @@ Source code for mala.datahandling.data_scaler
##########################
if self.scale_standard:
- current_data_count = list(unscaled.size())[0]\
- * list(unscaled.size())[1]
+ current_data_count = (
+ list(unscaled.size())[0] * list(unscaled.size())[1]
+ )
new_mean = torch.mean(unscaled)
new_std = torch.std(unscaled)
@@ -250,28 +257,31 @@ Source code for mala.datahandling.data_scaler
old_mean = self.total_mean
old_std = self.total_std
- self.total_mean = \
- self.total_data_count / \
- (self.total_data_count + current_data_count) * \
- old_mean + current_data_count / \
- (self.total_data_count + current_data_count) *\
- new_mean
+ self.total_mean = (
+ self.total_data_count
+ / (self.total_data_count + current_data_count)
+ * old_mean
+ + current_data_count
+ / (self.total_data_count + current_data_count)
+ * new_mean
+ )
# This equation is taken from the Sandia code. It
# presumably works, but it gets slightly different
# results.
# Maybe we should check it at some point.
# I think it is merely an issue of numerical accuracy.
- self.total_std = \
- self.total_data_count / \
- (self.total_data_count + current_data_count) * \
- old_std ** 2 + \
- current_data_count / \
- (self.total_data_count + current_data_count) \
- * new_std ** 2 + \
- (self.total_data_count * current_data_count) / \
- (self.total_data_count + current_data_count) \
- ** 2 * (old_mean - new_mean) ** 2
+ self.total_std = (
+ self.total_data_count
+ / (self.total_data_count + current_data_count)
+ * old_std**2
+ + current_data_count
+ / (self.total_data_count + current_data_count)
+ * new_std**2
+ + (self.total_data_count * current_data_count)
+ / (self.total_data_count + current_data_count) ** 2
+ * (old_mean - new_mean) ** 2
+ )
self.total_std = torch.sqrt(self.total_std)
self.total_data_count += current_data_count
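The incremental update reformatted here is the standard two-sample pooling of means and population variances. A small NumPy sketch checks the formula against a direct computation; the gap between population (ddof=0) and sample statistics may be behind the "slightly different results" the neighbouring comment mentions:

```python
import numpy as np

rng = np.random.default_rng(0)
old, new = rng.random(1000), rng.random(500)
n_old, n_new = old.size, new.size
old_mean, new_mean = old.mean(), new.mean()
old_std, new_std = old.std(), new.std()  # population std (ddof=0)

# Same combination as in incremental_fit above.
mean = (
    n_old / (n_old + n_new) * old_mean
    + n_new / (n_old + n_new) * new_mean
)
var = (
    n_old / (n_old + n_new) * old_std**2
    + n_new / (n_old + n_new) * new_std**2
    + (n_old * n_new) / (n_old + n_new) ** 2 * (old_mean - new_mean) ** 2
)

both = np.concatenate([old, new])
assert np.isclose(mean, both.mean())
assert np.isclose(np.sqrt(var), both.std())
```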
@@ -359,8 +369,10 @@ Source code for mala.datahandling.data_scaler
pass
elif self.cantransform is False:
- raise Exception("Transformation cannot be done, this DataScaler "
- "was never initialized")
+ raise Exception(
+ "Transformation cannot be done, this DataScaler "
+ "was never initialized"
+ )
# Perform the actual scaling, but use no_grad to make sure
# that the next couple of iterations stay untracked.
@@ -377,7 +389,7 @@ Source code for mala.datahandling.data_scaler
if self.scale_normal:
unscaled -= self.mins
- unscaled /= (self.maxs - self.mins)
+ unscaled /= self.maxs - self.mins
else:
@@ -391,7 +403,7 @@ Source code for mala.datahandling.data_scaler
if self.scale_normal:
unscaled -= self.total_min
- unscaled /= (self.total_max - self.total_min)
+ unscaled /= self.total_max - self.total_min
[docs] def inverse_transform(self, scaled, as_numpy=False):
"""
@@ -420,8 +432,10 @@ Source code for mala.datahandling.data_scaler
else:
if self.cantransform is False:
- raise Exception("Backtransformation cannot be done, this "
- "DataScaler was never initialized")
+ raise Exception(
+ "Backtransformation cannot be done, this "
+ "DataScaler was never initialized"
+ )
# Perform the actual scaling, but use no_grad to make sure
# that the next couple of iterations stay untracked.
@@ -436,8 +450,9 @@ Source code for mala.datahandling.data_scaler
unscaled = (scaled * self.stds) + self.means
if self.scale_normal:
- unscaled = (scaled*(self.maxs
- - self.mins)) + self.mins
+ unscaled = (
+ scaled * (self.maxs - self.mins)
+ ) + self.mins
else:
@@ -449,9 +464,10 @@ Source code for mala.datahandling.data_scaler
unscaled = (scaled * self.total_std) + self.total_mean
if self.scale_normal:
- unscaled = (scaled*(self.total_max
- - self.total_min)) + self.total_min
-#
+ unscaled = (
+ scaled * (self.total_max - self.total_min)
+ ) + self.total_min
+ #
if as_numpy:
return unscaled.detach().numpy().astype(np.float64)
else:
@@ -474,7 +490,7 @@ Source code for mala.datahandling.data_scaler
if hvd.rank() != 0:
return
if save_format == "pickle":
- with open(filename, 'wb') as handle:
+ with open(filename, "wb") as handle:
pickle.dump(self, handle, protocol=4)
else:
raise Exception("Unsupported parameter save format.")
@@ -499,7 +515,7 @@ Source code for mala.datahandling.data_scaler
"""
if save_format == "pickle":
if isinstance(file, str):
- loaded_scaler = pickle.load(open(file, 'rb'))
+ loaded_scaler = pickle.load(open(file, "rb"))
else:
loaded_scaler = pickle.load(file)
else:
diff --git a/_modules/mala/datahandling/data_shuffler.html b/_modules/mala/datahandling/data_shuffler.html
index 52c7075b5..87ee629e6 100644
--- a/_modules/mala/datahandling/data_shuffler.html
+++ b/_modules/mala/datahandling/data_shuffler.html
@@ -75,12 +75,15 @@
Source code for mala.datahandling.data_shuffler
"""Mixes data between snapshots for improved lazy-loading training."""
+
import os
import numpy as np
-import mala
-from mala.common.parameters import ParametersData, Parameters, DEFAULT_NP_DATA_DTYPE
+from mala.common.parameters import (
+ Parameters,
+ DEFAULT_NP_DATA_DTYPE,
+)
from mala.common.parallelizer import printout
from mala.common.physical_data import PhysicalData
from mala.datahandling.data_handler_base import DataHandlerBase
@@ -107,21 +110,34 @@ Source code for mala.datahandling.data_shuffler
be created by this class.
"""
- def __init__(self, parameters: Parameters, target_calculator=None,
- descriptor_calculator=None):
- super(DataShuffler, self).__init__(parameters,
- target_calculator=target_calculator,
- descriptor_calculator=
- descriptor_calculator)
+ def __init__(
+ self,
+ parameters: Parameters,
+ target_calculator=None,
+ descriptor_calculator=None,
+ ):
+ super(DataShuffler, self).__init__(
+ parameters,
+ target_calculator=target_calculator,
+ descriptor_calculator=descriptor_calculator,
+ )
if self.descriptor_calculator.parameters.descriptors_contain_xyz:
- printout("Disabling XYZ-cutting from descriptor data for "
- "shuffling. If needed, please re-enable afterwards.")
- self.descriptor_calculator.parameters.descriptors_contain_xyz = \
+ printout(
+ "Disabling XYZ-cutting from descriptor data for "
+ "shuffling. If needed, please re-enable afterwards."
+ )
+ self.descriptor_calculator.parameters.descriptors_contain_xyz = (
False
-
-[docs] def add_snapshot(self, input_file, input_directory,
- output_file, output_directory,
- snapshot_type="numpy"):
+ )
+
+[docs] def add_snapshot(
+ self,
+ input_file,
+ input_directory,
+ output_file,
+ output_directory,
+ snapshot_type="numpy",
+ ):
"""
Add a snapshot to the data pipeline.
@@ -143,100 +159,151 @@ Source code for mala.datahandling.data_shuffler
Either "numpy" or "openpmd" based on what kind of files you
want to operate on.
"""
- super(DataShuffler, self).\
- add_snapshot(input_file, input_directory,
- output_file, output_directory,
- add_snapshot_as="te",
- output_units="None", input_units="None",
- calculation_output_file="",
- snapshot_type=snapshot_type)
-
- def __shuffle_numpy(self, number_of_new_snapshots, shuffle_dimensions,
- descriptor_save_path, save_name, target_save_path,
- permutations, file_ending):
+ super(DataShuffler, self).add_snapshot(
+ input_file,
+ input_directory,
+ output_file,
+ output_directory,
+ add_snapshot_as="te",
+ output_units="None",
+ input_units="None",
+ calculation_output_file="",
+ snapshot_type=snapshot_type,
+ )
+
+ def __shuffle_numpy(
+ self,
+ number_of_new_snapshots,
+ shuffle_dimensions,
+ descriptor_save_path,
+ save_name,
+ target_save_path,
+ permutations,
+ file_ending,
+ ):
# Load the data (via memmap).
descriptor_data = []
target_data = []
- for idx, snapshot in enumerate(self.parameters.
- snapshot_directories_list):
+ for idx, snapshot in enumerate(
+ self.parameters.snapshot_directories_list
+ ):
# TODO: Use descriptor and target calculator for this.
- descriptor_data.append(np.load(os.path.join(snapshot.
- input_npy_directory,
- snapshot.input_npy_file),
- mmap_mode="r"))
- target_data.append(np.load(os.path.join(snapshot.
- output_npy_directory,
- snapshot.output_npy_file),
- mmap_mode="r"))
+ descriptor_data.append(
+ np.load(
+ os.path.join(
+ snapshot.input_npy_directory, snapshot.input_npy_file
+ ),
+ mmap_mode="r",
+ )
+ )
+ target_data.append(
+ np.load(
+ os.path.join(
+ snapshot.output_npy_directory,
+ snapshot.output_npy_file,
+ ),
+ mmap_mode="r",
+ )
+ )
# Do the actual shuffling.
for i in range(0, number_of_new_snapshots):
- new_descriptors = np.zeros((int(np.prod(shuffle_dimensions)),
- self.input_dimension),
- dtype=DEFAULT_NP_DATA_DTYPE)
- new_targets = np.zeros((int(np.prod(shuffle_dimensions)),
- self.output_dimension),
- dtype=DEFAULT_NP_DATA_DTYPE)
+ new_descriptors = np.zeros(
+ (int(np.prod(shuffle_dimensions)), self.input_dimension),
+ dtype=DEFAULT_NP_DATA_DTYPE,
+ )
+ new_targets = np.zeros(
+ (int(np.prod(shuffle_dimensions)), self.output_dimension),
+ dtype=DEFAULT_NP_DATA_DTYPE,
+ )
last_start = 0
- descriptor_name = os.path.join(descriptor_save_path,
- save_name.replace("*", str(i)))
- target_name = os.path.join(target_save_path,
- save_name.replace("*", str(i)))
+ descriptor_name = os.path.join(
+ descriptor_save_path, save_name.replace("*", str(i))
+ )
+ target_name = os.path.join(
+ target_save_path, save_name.replace("*", str(i))
+ )
# Each new snapshot gets a number_of_new_snapshots-th from each
# snapshot.
for j in range(0, self.nr_snapshots):
- current_grid_size = self.parameters.\
- snapshot_directories_list[j].grid_size
- current_chunk = int(current_grid_size /
- number_of_new_snapshots)
- new_descriptors[last_start:current_chunk+last_start] = \
- descriptor_data[j].reshape(current_grid_size,
- self.input_dimension) \
- [i*current_chunk:(i+1)*current_chunk, :]
- new_targets[last_start:current_chunk+last_start] = \
- target_data[j].reshape(current_grid_size,
- self.output_dimension) \
- [i*current_chunk:(i+1)*current_chunk, :]
+ current_grid_size = self.parameters.snapshot_directories_list[
+ j
+ ].grid_size
+ current_chunk = int(
+ current_grid_size / number_of_new_snapshots
+ )
+ new_descriptors[
+ last_start : current_chunk + last_start
+ ] = descriptor_data[j].reshape(
+ current_grid_size, self.input_dimension
+ )[
+ i * current_chunk : (i + 1) * current_chunk, :
+ ]
+ new_targets[
+ last_start : current_chunk + last_start
+ ] = target_data[j].reshape(
+ current_grid_size, self.output_dimension
+ )[
+ i * current_chunk : (i + 1) * current_chunk, :
+ ]
last_start += current_chunk
# Randomize and save to disk.
new_descriptors = new_descriptors[permutations[i]]
new_targets = new_targets[permutations[i]]
- new_descriptors = new_descriptors.reshape([shuffle_dimensions[0],
- shuffle_dimensions[1],
- shuffle_dimensions[2],
- self.input_dimension])
- new_targets = new_targets.reshape([shuffle_dimensions[0],
- shuffle_dimensions[1],
- shuffle_dimensions[2],
- self.output_dimension])
+ new_descriptors = new_descriptors.reshape(
+ [
+ shuffle_dimensions[0],
+ shuffle_dimensions[1],
+ shuffle_dimensions[2],
+ self.input_dimension,
+ ]
+ )
+ new_targets = new_targets.reshape(
+ [
+ shuffle_dimensions[0],
+ shuffle_dimensions[1],
+ shuffle_dimensions[2],
+ self.output_dimension,
+ ]
+ )
if file_ending == "npy":
- self.descriptor_calculator.\
- write_to_numpy_file(descriptor_name+".in.npy",
- new_descriptors)
- self.target_calculator.\
- write_to_numpy_file(target_name+".out.npy",
- new_targets)
+ self.descriptor_calculator.write_to_numpy_file(
+ descriptor_name + ".in.npy", new_descriptors
+ )
+ self.target_calculator.write_to_numpy_file(
+ target_name + ".out.npy", new_targets
+ )
else:
# We check above that in the non-numpy case, OpenPMD will work.
- self.descriptor_calculator.grid_dimensions = \
- list(shuffle_dimensions)
- self.target_calculator.grid_dimensions = \
- list(shuffle_dimensions)
- self.descriptor_calculator.\
- write_to_openpmd_file(descriptor_name+".in."+file_ending,
- new_descriptors,
- additional_attributes={"global_shuffling_seed": self.parameters.shuffling_seed,
- "local_shuffling_seed": i*self.parameters.shuffling_seed},
- internal_iteration_number=i)
- self.target_calculator.\
- write_to_openpmd_file(target_name+".out."+file_ending,
- array=new_targets,
- additional_attributes={"global_shuffling_seed": self.parameters.shuffling_seed,
- "local_shuffling_seed": i*self.parameters.shuffling_seed},
- internal_iteration_number=i)
+ self.descriptor_calculator.grid_dimensions = list(
+ shuffle_dimensions
+ )
+ self.target_calculator.grid_dimensions = list(
+ shuffle_dimensions
+ )
+ self.descriptor_calculator.write_to_openpmd_file(
+ descriptor_name + ".in." + file_ending,
+ new_descriptors,
+ additional_attributes={
+ "global_shuffling_seed": self.parameters.shuffling_seed,
+ "local_shuffling_seed": i
+ * self.parameters.shuffling_seed,
+ },
+ internal_iteration_number=i,
+ )
+ self.target_calculator.write_to_openpmd_file(
+ target_name + ".out." + file_ending,
+ array=new_targets,
+ additional_attributes={
+ "global_shuffling_seed": self.parameters.shuffling_seed,
+ "local_shuffling_seed": i
+ * self.parameters.shuffling_seed,
+ },
+ internal_iteration_number=i,
+ )
# The function __shuffle_openpmd can be used to shuffle descriptor data and
# target data.
@@ -244,8 +311,15 @@ Source code for mala.datahandling.data_shuffler
# Use this class to parameterize which of both should be shuffled.
class __DescriptorOrTarget:
- def __init__(self, save_path, npy_directory, npy_file, calculator,
- name_infix, dimension):
+ def __init__(
+ self,
+ save_path,
+ npy_directory,
+ npy_file,
+ calculator,
+ name_infix,
+ dimension,
+ ):
self.save_path = save_path
self.npy_directory = npy_directory
self.npy_file = npy_file
@@ -259,10 +333,15 @@ Source code for mala.datahandling.data_shuffler
self.rank = 0
self.size = 1
-
- def __shuffle_openpmd(self, dot: __DescriptorOrTarget,
- number_of_new_snapshots, shuffle_dimensions,
- save_name, permutations, file_ending):
+ def __shuffle_openpmd(
+ self,
+ dot: __DescriptorOrTarget,
+ number_of_new_snapshots,
+ shuffle_dimensions,
+ save_name,
+ permutations,
+ file_ending,
+ ):
import openpmd_api as io
if self.parameters._configuration["mpi"]:
@@ -271,18 +350,21 @@ Source code for mala.datahandling.data_shuffler
comm = self.__MockedMPIComm()
import math
+
items_per_process = math.ceil(number_of_new_snapshots / comm.size)
my_items_start = comm.rank * items_per_process
- my_items_end = min((comm.rank + 1) * items_per_process,
- number_of_new_snapshots)
+ my_items_end = min(
+ (comm.rank + 1) * items_per_process, number_of_new_snapshots
+ )
my_items_count = my_items_end - my_items_start
if self.parameters._configuration["mpi"]:
# imagine we have 20 new snapshots to create, but 100 ranks
# it's sufficient to let only the first 20 ranks participate in the
# following code
- num_of_participating_ranks = math.ceil(number_of_new_snapshots /
- items_per_process)
+ num_of_participating_ranks = math.ceil(
+ number_of_new_snapshots / items_per_process
+ )
color = comm.rank < num_of_participating_ranks
comm = comm.Split(color=int(color), key=comm.rank)
if not color:
@@ -291,20 +373,30 @@ Source code for mala.datahandling.data_shuffler
# Load the data
input_series_list = []
for idx, snapshot in enumerate(
- self.parameters.snapshot_directories_list):
+ self.parameters.snapshot_directories_list
+ ):
# TODO: Use descriptor and target calculator for this.
if isinstance(comm, self.__MockedMPIComm):
input_series_list.append(
io.Series(
- os.path.join(dot.npy_directory(snapshot),
- dot.npy_file(snapshot)),
- io.Access.read_only))
+ os.path.join(
+ dot.npy_directory(snapshot),
+ dot.npy_file(snapshot),
+ ),
+ io.Access.read_only,
+ )
+ )
else:
input_series_list.append(
io.Series(
- os.path.join(dot.npy_directory(snapshot),
- dot.npy_file(snapshot)),
- io.Access.read_only, comm))
+ os.path.join(
+ dot.npy_directory(snapshot),
+ dot.npy_file(snapshot),
+ ),
+ io.Access.read_only,
+ comm,
+ )
+ )
# Peek into the input snapshots to determine the datatypes.
for series in input_series_list:
@@ -331,8 +423,10 @@ Source code for mala.datahandling.data_shuffler
extent_dim_0 = dset[slice_dimension]
if extent_dim_0 % n != 0:
raise Exception(
- "Dataset {} cannot be split into {} chunks on dimension {}."
- .format(dset, n, slice_dimension))
+ "Dataset {} cannot be split into {} chunks on dimension {}.".format(
+ dset, n, slice_dimension
+ )
+ )
single_chunk_len = extent_dim_0 // n
offset[slice_dimension] = i * single_chunk_len
extent[slice_dimension] = single_chunk_len
@@ -344,36 +438,48 @@ Source code for mala.datahandling.data_shuffler
for i in range(my_items_start, my_items_end):
# We check above that in the non-numpy case, OpenPMD will work.
dot.calculator.grid_dimensions = list(shuffle_dimensions)
- name_prefix = os.path.join(dot.save_path,
- save_name.replace("*", str(i)))
+ name_prefix = os.path.join(
+ dot.save_path, save_name.replace("*", str(i))
+ )
# do NOT open with MPI
shuffled_snapshot_series = io.Series(
name_prefix + dot.name_infix + file_ending,
io.Access.create,
options=json.dumps(
- self.parameters._configuration["openpmd_configuration"]))
- dot.calculator.\
- write_to_openpmd_file(shuffled_snapshot_series,
- PhysicalData.SkipArrayWriting(dataset, feature_size),
- additional_attributes={"global_shuffling_seed": self.parameters.shuffling_seed,
- "local_shuffling_seed": i*self.parameters.shuffling_seed},
- internal_iteration_number=i)
+ self.parameters._configuration["openpmd_configuration"]
+ ),
+ )
+ dot.calculator.write_to_openpmd_file(
+ shuffled_snapshot_series,
+ PhysicalData.SkipArrayWriting(dataset, feature_size),
+ additional_attributes={
+ "global_shuffling_seed": self.parameters.shuffling_seed,
+ "local_shuffling_seed": i * self.parameters.shuffling_seed,
+ },
+ internal_iteration_number=i,
+ )
mesh_out = shuffled_snapshot_series.write_iterations()[i].meshes[
- dot.calculator.data_name]
+ dot.calculator.data_name
+ ]
new_array = np.zeros(
(dot.dimension, int(np.prod(shuffle_dimensions))),
- dtype=dataset.dtype)
+ dtype=dataset.dtype,
+ )
# Need to add to these in the loop as the single chunks might have
# different sizes
to_chunk_offset, to_chunk_extent = 0, 0
for j in range(0, self.nr_snapshots):
- extent_in = self.parameters.snapshot_directories_list[j].grid_dimension
+ extent_in = self.parameters.snapshot_directories_list[
+ j
+ ].grid_dimension
if len(input_series_list[j].iterations) != 1:
raise Exception(
- "Input Series '{}' has {} iterations (needs exactly one)."
- .format(input_series_list[j].name,
- len(input_series_list[j].iterations)))
+ "Input Series '{}' has {} iterations (needs exactly one).".format(
+ input_series_list[j].name,
+ len(input_series_list[j].iterations),
+ )
+ )
for iteration in input_series_list[j].read_iterations():
mesh_in = iteration.meshes[dot.calculator.data_name]
break
@@ -384,19 +490,23 @@ Source code for mala.datahandling.data_shuffler
# in openPMD, to_chunk_extent describes the upper coordinate of
# the slice, as is usual in Python.
from_chunk_offset, from_chunk_extent = from_chunk_i(
- i, number_of_new_snapshots, extent_in)
+ i, number_of_new_snapshots, extent_in
+ )
to_chunk_offset = to_chunk_extent
to_chunk_extent = to_chunk_offset + np.prod(from_chunk_extent)
for dimension in range(len(mesh_in)):
mesh_in[str(dimension)].load_chunk(
new_array[dimension, to_chunk_offset:to_chunk_extent],
- from_chunk_offset, from_chunk_extent)
+ from_chunk_offset,
+ from_chunk_extent,
+ )
mesh_in.series_flush()
for k in range(feature_size):
rc = mesh_out[str(k)]
rc[:, :, :] = new_array[k, :][permutations[i]].reshape(
- shuffle_dimensions)
+ shuffle_dimensions
+ )
shuffled_snapshot_series.close()
# Ensure consistent parallel destruction
@@ -404,12 +514,14 @@ Source code for mala.datahandling.data_shuffler
for series in input_series_list:
series.close()
-[docs] def shuffle_snapshots(self,
- complete_save_path=None,
- descriptor_save_path=None,
- target_save_path=None,
- save_name="mala_shuffled_snapshot*",
- number_of_shuffled_snapshots=None):
+[docs] def shuffle_snapshots(
+ self,
+ complete_save_path=None,
+ descriptor_save_path=None,
+ target_save_path=None,
+ save_name="mala_shuffled_snapshot*",
+ number_of_shuffled_snapshots=None,
+ ):
"""
Shuffle the snapshots into new snapshots.
@@ -452,8 +564,9 @@ Source code for mala.datahandling.data_shuffler
import openpmd_api as io
if file_ending not in io.file_extensions:
- raise Exception("Invalid file ending selected: " +
- file_ending)
+ raise Exception(
+ "Invalid file ending selected: " + file_ending
+ )
else:
file_ending = "npy"
@@ -469,12 +582,15 @@ Source code for mala.datahandling.data_shuffler
if len(snapshot_types) > 1:
raise Exception(
"[data_shuffler] Can only deal with one type of input snapshot"
- + " at once (openPMD or numpy).")
+ + " at once (openPMD or numpy)."
+ )
snapshot_type = snapshot_types.pop()
del snapshot_types
- snapshot_size_list = [snapshot.grid_size for snapshot in
- self.parameters.snapshot_directories_list]
+ snapshot_size_list = [
+ snapshot.grid_size
+ for snapshot in self.parameters.snapshot_directories_list
+ ]
number_of_data_points = np.sum(snapshot_size_list)
if number_of_shuffled_snapshots is None:
@@ -483,8 +599,9 @@ Source code for mala.datahandling.data_shuffler
# If all snapshots have the same size, we can just replicate the
# snapshot structure.
if np.max(snapshot_size_list) == np.min(snapshot_size_list):
- shuffle_dimensions = self.parameters.\
- snapshot_directories_list[0].grid_dimension
+ shuffle_dimensions = self.parameters.snapshot_directories_list[
+ 0
+ ].grid_dimension
number_of_new_snapshots = self.nr_snapshots
else:
# If the snapshots have different sizes we simply create
@@ -494,30 +611,44 @@ Source code for mala.datahandling.data_shuffler
number_of_new_snapshots += 1
# If they do have different sizes, we start with the smallest
# snapshot, there is some padding down below anyhow.
- shuffle_dimensions = [int(number_of_data_points /
- number_of_new_snapshots), 1, 1]
+ shuffle_dimensions = [
+ int(number_of_data_points / number_of_new_snapshots),
+ 1,
+ 1,
+ ]
- if snapshot_type == 'openpmd':
+ if snapshot_type == "openpmd":
import math
import functools
+
number_of_new_snapshots = functools.reduce(
- math.gcd, [
- snapshot.grid_dimension[0] for snapshot in
- self.parameters.snapshot_directories_list
- ], number_of_new_snapshots)
+ math.gcd,
+ [
+ snapshot.grid_dimension[0]
+ for snapshot in self.parameters.snapshot_directories_list
+ ],
+ number_of_new_snapshots,
+ )
else:
number_of_new_snapshots = number_of_shuffled_snapshots
- if snapshot_type == 'openpmd':
+ if snapshot_type == "openpmd":
import math
import functools
+
specified_number_of_new_snapshots = number_of_new_snapshots
number_of_new_snapshots = functools.reduce(
- math.gcd, [
- snapshot.grid_dimension[0] for snapshot in
- self.parameters.snapshot_directories_list
- ], number_of_new_snapshots)
- if number_of_new_snapshots != specified_number_of_new_snapshots:
+ math.gcd,
+ [
+ snapshot.grid_dimension[0]
+ for snapshot in self.parameters.snapshot_directories_list
+ ],
+ number_of_new_snapshots,
+ )
+ if (
+ number_of_new_snapshots
+ != specified_number_of_new_snapshots
+ ):
print(
f"[openPMD shuffling] Reduced the number of output snapshots to "
f"{number_of_new_snapshots} because of the dataset dimensions."
@@ -525,14 +656,22 @@ Source code for mala.datahandling.data_shuffler
del specified_number_of_new_snapshots
if number_of_data_points % number_of_new_snapshots != 0:
- raise Exception("Cannot create this number of snapshots "
- "from data provided.")
+ raise Exception(
+ "Cannot create this number of snapshots "
+ "from data provided."
+ )
else:
- shuffle_dimensions = [int(number_of_data_points /
- number_of_new_snapshots), 1, 1]
-
- printout("Data shuffler will generate", number_of_new_snapshots,
- "new snapshots.")
+ shuffle_dimensions = [
+ int(number_of_data_points / number_of_new_snapshots),
+ 1,
+ 1,
+ ]
+
+ printout(
+ "Data shuffler will generate",
+ number_of_new_snapshots,
+ "new snapshots.",
+ )
printout("Shuffled snapshot dimension will be ", shuffle_dimensions)
# Prepare permutations.
@@ -542,34 +681,57 @@ Source code for mala.datahandling.data_shuffler
# This makes the shuffling deterministic, if specified by the user.
if self.parameters.shuffling_seed is not None:
- np.random.seed(i*self.parameters.shuffling_seed)
- permutations.append(np.random.permutation(
- int(np.prod(shuffle_dimensions))))
-
- if snapshot_type == 'numpy':
- self.__shuffle_numpy(number_of_new_snapshots, shuffle_dimensions,
- descriptor_save_path, save_name,
- target_save_path, permutations, file_ending)
- elif snapshot_type == 'openpmd':
+ np.random.seed(i * self.parameters.shuffling_seed)
+ permutations.append(
+ np.random.permutation(int(np.prod(shuffle_dimensions)))
+ )
+
+ if snapshot_type == "numpy":
+ self.__shuffle_numpy(
+ number_of_new_snapshots,
+ shuffle_dimensions,
+ descriptor_save_path,
+ save_name,
+ target_save_path,
+ permutations,
+ file_ending,
+ )
+ elif snapshot_type == "openpmd":
descriptor = self.__DescriptorOrTarget(
- descriptor_save_path, lambda x: x.input_npy_directory,
- lambda x: x.input_npy_file, self.descriptor_calculator, ".in.",
- self.input_dimension)
- self.__shuffle_openpmd(descriptor, number_of_new_snapshots,
- shuffle_dimensions, save_name, permutations,
- file_ending)
- target = self.__DescriptorOrTarget(target_save_path,
- lambda x: x.output_npy_directory,
- lambda x: x.output_npy_file,
- self.target_calculator, ".out.",
- self.output_dimension)
- self.__shuffle_openpmd(target, number_of_new_snapshots,
- shuffle_dimensions, save_name, permutations,
- file_ending)
+ descriptor_save_path,
+ lambda x: x.input_npy_directory,
+ lambda x: x.input_npy_file,
+ self.descriptor_calculator,
+ ".in.",
+ self.input_dimension,
+ )
+ self.__shuffle_openpmd(
+ descriptor,
+ number_of_new_snapshots,
+ shuffle_dimensions,
+ save_name,
+ permutations,
+ file_ending,
+ )
+ target = self.__DescriptorOrTarget(
+ target_save_path,
+ lambda x: x.output_npy_directory,
+ lambda x: x.output_npy_file,
+ self.target_calculator,
+ ".out.",
+ self.output_dimension,
+ )
+ self.__shuffle_openpmd(
+ target,
+ number_of_new_snapshots,
+ shuffle_dimensions,
+ save_name,
+ permutations,
+ file_ending,
+ )
else:
raise Exception("Unknown snapshot type: {}".format(snapshot_type))
-
# Since no training will be done with this class, we should always
# clear the data at the end.
self.clear_data()
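The numpy branch of the data shuffler above reduces to three steps: draw one permutation per output snapshot (seeded as i * shuffling_seed when a seed is set, which makes the result deterministic), collect an equal-sized chunk from every input snapshot, and write the permuted block to disk. The following is a minimal stand-alone sketch of that idea under the assumption that all snapshots have the same grid size; the toy data and the helper name are illustrative only, not part of MALA.

    import numpy as np

    def shuffle_snapshots_sketch(snapshots, number_of_new_snapshots, shuffling_seed=None):
        """Toy version of the chunk-and-permute scheme shown above."""
        grid_size = snapshots[0].shape[0]              # all snapshots equally sized
        chunk = grid_size // number_of_new_snapshots
        new_snapshots = []
        for i in range(number_of_new_snapshots):
            # Deterministic permutation per output snapshot, if a seed is given.
            if shuffling_seed is not None:
                np.random.seed(i * shuffling_seed)
            permutation = np.random.permutation(chunk * len(snapshots))
            # The i-th chunk of every input snapshot ends up in output snapshot i.
            pieces = [s[i * chunk : (i + 1) * chunk] for s in snapshots]
            new_snapshots.append(np.concatenate(pieces)[permutation])
        return new_snapshots

    # Three fake snapshots with 12 grid points and 4 features each.
    data = [np.random.rand(12, 4) for _ in range(3)]
    shuffled = shuffle_snapshots_sketch(data, number_of_new_snapshots=3, shuffling_seed=42)
    print(len(shuffled), shuffled[0].shape)            # 3 (12, 4)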
diff --git a/_modules/mala/datahandling/fast_tensor_dataset.html b/_modules/mala/datahandling/fast_tensor_dataset.html
index da7dedb2f..115e91623 100644
--- a/_modules/mala/datahandling/fast_tensor_dataset.html
+++ b/_modules/mala/datahandling/fast_tensor_dataset.html
@@ -75,6 +75,7 @@
Source code for mala.datahandling.fast_tensor_dataset
"""A special type of tensor data set for improved performance."""
+
import numpy as np
import torch
@@ -111,7 +112,9 @@ Source code for mala.datahandling.fast_tensor_dataset
batch : tuple
The data tuple for this batch.
"""
- batch = self.indices[idx*self.batch_size:(idx+1)*self.batch_size]
+ batch = self.indices[
+ idx * self.batch_size : (idx + 1) * self.batch_size
+ ]
rv = tuple(t[batch, ...] for t in self.tensors)
return rv
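The reformatted __getitem__ above slices a pre-shuffled index array to return whole batches at once, which is what makes this data set "fast". A minimal stand-alone sketch of the same pattern, assuming nothing about MALA's actual class beyond the slicing shown above:

    import numpy as np
    import torch

    class BatchedTensorDataset:
        """Serve whole batches by slicing one pre-shuffled index array."""

        def __init__(self, batch_size, *tensors):
            self.batch_size = batch_size
            self.tensors = tensors
            self.indices = np.arange(tensors[0].shape[0])

        def shuffle(self):
            # Shuffle once per epoch instead of once per sample.
            np.random.shuffle(self.indices)

        def __len__(self):
            return len(self.indices) // self.batch_size

        def __getitem__(self, idx):
            batch = self.indices[idx * self.batch_size : (idx + 1) * self.batch_size]
            return tuple(t[batch, ...] for t in self.tensors)

    inputs, outputs = torch.rand(100, 8), torch.rand(100, 2)
    dataset = BatchedTensorDataset(10, inputs, outputs)
    dataset.shuffle()
    x, y = dataset[0]
    print(x.shape, y.shape)    # torch.Size([10, 8]) torch.Size([10, 2])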
diff --git a/_modules/mala/datahandling/lazy_load_dataset.html b/_modules/mala/datahandling/lazy_load_dataset.html
index 0b86df65d..a8c914b21 100644
--- a/_modules/mala/datahandling/lazy_load_dataset.html
+++ b/_modules/mala/datahandling/lazy_load_dataset.html
@@ -75,6 +75,7 @@
Source code for mala.datahandling.lazy_load_dataset
"""DataSet for lazy-loading."""
+
import os
try:
@@ -91,7 +92,7 @@ Source code for mala.datahandling.lazy_load_dataset
from mala.datahandling.snapshot import Snapshot
-[docs]class LazyLoadDataset(torch.utils.data.Dataset):
+[docs]class LazyLoadDataset(Dataset):
"""
DataSet class for lazy loading.
@@ -129,10 +130,17 @@ Source code for mala.datahandling.lazy_load_dataset
If True, then the gradient is stored for the inputs.
"""
- def __init__(self, input_dimension, output_dimension, input_data_scaler,
- output_data_scaler, descriptor_calculator,
- target_calculator, use_horovod,
- input_requires_grad=False):
+ def __init__(
+ self,
+ input_dimension,
+ output_dimension,
+ input_data_scaler,
+ output_data_scaler,
+ descriptor_calculator,
+ target_calculator,
+ use_horovod,
+ input_requires_grad=False,
+ ):
self.snapshot_list = []
self.input_dimension = input_dimension
self.output_dimension = output_dimension
@@ -142,8 +150,9 @@ Source code for mala.datahandling.lazy_load_dataset
self.target_calculator = target_calculator
self.number_of_snapshots = 0
self.total_size = 0
- self.descriptors_contain_xyz = self.descriptor_calculator.\
- descriptors_contain_xyz
+ self.descriptors_contain_xyz = (
+ self.descriptor_calculator.descriptors_contain_xyz
+ )
self.currently_loaded_file = None
self.input_data = np.empty(0)
self.output_data = np.empty(0)
@@ -205,44 +214,56 @@ Source code for mala.datahandling.lazy_load_dataset
"""
# Load the data into RAM.
if self.snapshot_list[file_index].snapshot_type == "numpy":
- self.input_data = self.descriptor_calculator. \
- read_from_numpy_file(
- os.path.join(self.snapshot_list[file_index].input_npy_directory,
- self.snapshot_list[file_index].input_npy_file),
- units=self.snapshot_list[file_index].input_units)
- self.output_data = self.target_calculator. \
- read_from_numpy_file(
- os.path.join(self.snapshot_list[file_index].output_npy_directory,
- self.snapshot_list[file_index].output_npy_file),
- units=self.snapshot_list[file_index].output_units)
+ self.input_data = self.descriptor_calculator.read_from_numpy_file(
+ os.path.join(
+ self.snapshot_list[file_index].input_npy_directory,
+ self.snapshot_list[file_index].input_npy_file,
+ ),
+ units=self.snapshot_list[file_index].input_units,
+ )
+ self.output_data = self.target_calculator.read_from_numpy_file(
+ os.path.join(
+ self.snapshot_list[file_index].output_npy_directory,
+ self.snapshot_list[file_index].output_npy_file,
+ ),
+ units=self.snapshot_list[file_index].output_units,
+ )
elif self.snapshot_list[file_index].snapshot_type == "openpmd":
- self.input_data = self.descriptor_calculator. \
- read_from_openpmd_file(
- os.path.join(self.snapshot_list[file_index].input_npy_directory,
- self.snapshot_list[file_index].input_npy_file))
- self.output_data = self.target_calculator. \
- read_from_openpmd_file(
- os.path.join(self.snapshot_list[file_index].output_npy_directory,
- self.snapshot_list[file_index].output_npy_file))
+ self.input_data = (
+ self.descriptor_calculator.read_from_openpmd_file(
+ os.path.join(
+ self.snapshot_list[file_index].input_npy_directory,
+ self.snapshot_list[file_index].input_npy_file,
+ )
+ )
+ )
+ self.output_data = self.target_calculator.read_from_openpmd_file(
+ os.path.join(
+ self.snapshot_list[file_index].output_npy_directory,
+ self.snapshot_list[file_index].output_npy_file,
+ )
+ )
# Transform the data.
- self.input_data = \
- self.input_data.reshape([self.snapshot_list[file_index].grid_size,
- self.input_dimension])
+ self.input_data = self.input_data.reshape(
+ [self.snapshot_list[file_index].grid_size, self.input_dimension]
+ )
if self.input_data.dtype != DEFAULT_NP_DATA_DTYPE:
self.input_data = self.input_data.astype(DEFAULT_NP_DATA_DTYPE)
self.input_data = torch.from_numpy(self.input_data).float()
self.input_data_scaler.transform(self.input_data)
self.input_data.requires_grad = self.input_requires_grad
- self.output_data = \
- self.output_data.reshape([self.snapshot_list[file_index].grid_size,
- self.output_dimension])
+ self.output_data = self.output_data.reshape(
+ [self.snapshot_list[file_index].grid_size, self.output_dimension]
+ )
if self.return_outputs_directly is False:
self.output_data = np.array(self.output_data)
if self.output_data.dtype != DEFAULT_NP_DATA_DTYPE:
- self.output_data = self.output_data.astype(DEFAULT_NP_DATA_DTYPE)
+ self.output_data = self.output_data.astype(
+ DEFAULT_NP_DATA_DTYPE
+ )
self.output_data = torch.from_numpy(self.output_data).float()
self.output_data_scaler.transform(self.output_data)
@@ -258,9 +279,11 @@ Source code for mala.datahandling.lazy_load_dataset
file_index = i
# From the end of previous file to beginning of new.
- if index_in_file == self.snapshot_list[i].grid_size and \
- is_start:
- file_index = i+1
+ if (
+ index_in_file == self.snapshot_list[i].grid_size
+ and is_start
+ ):
+ file_index = i + 1
index_in_file = 0
break
else:
@@ -297,35 +320,44 @@ Source code for mala.datahandling.lazy_load_dataset
# Find out if new data is needed.
if file_index != self.currently_loaded_file:
self.get_new_data(file_index)
- return self.input_data[index_in_file], \
- self.output_data[index_in_file]
+ return (
+ self.input_data[index_in_file],
+ self.output_data[index_in_file],
+ )
elif isinstance(idx, slice):
# If a slice is requested, we have to find out if it spans files.
- file_index_start, index_in_file_start = self.\
- _get_file_index(idx.start, is_slice=True, is_start=True)
- file_index_stop, index_in_file_stop = self.\
- _get_file_index(idx.stop, is_slice=True)
+ file_index_start, index_in_file_start = self._get_file_index(
+ idx.start, is_slice=True, is_start=True
+ )
+ file_index_stop, index_in_file_stop = self._get_file_index(
+ idx.stop, is_slice=True
+ )
# If it does, we cannot deliver.
# Take care though, if a full snapshot is requested,
# the stop index will point to the wrong file.
if file_index_start != file_index_stop:
if index_in_file_stop == 0:
- index_in_file_stop = self.snapshot_list[file_index_stop].\
- grid_size
+ index_in_file_stop = self.snapshot_list[
+ file_index_stop
+ ].grid_size
else:
- raise Exception("Lazy loading currently only supports "
- "slices in one file. "
- "You have requested a slice over two "
- "files.")
+ raise Exception(
+ "Lazy loading currently only supports "
+ "slices in one file. "
+ "You have requested a slice over two "
+ "files."
+ )
# Find out if new data is needed.
file_index = file_index_start
if file_index != self.currently_loaded_file:
self.get_new_data(file_index)
- return self.input_data[index_in_file_start:index_in_file_stop], \
- self.output_data[index_in_file_start:index_in_file_stop]
+ return (
+ self.input_data[index_in_file_start:index_in_file_stop],
+ self.output_data[index_in_file_start:index_in_file_stop],
+ )
else:
raise Exception("Invalid idx provided.")
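LazyLoadDataset above keeps only one snapshot in memory at a time, so every access first translates a global sample index into a (file index, index inside file) pair, and slices that would span two files are rejected. The index arithmetic itself only needs the per-file grid sizes; a stand-alone sketch with a hypothetical helper (not MALA's actual _get_file_index):

    def get_file_index(idx, grid_sizes):
        """Map a global index to (file_index, index_in_file) from per-file sizes."""
        offset = 0
        for file_index, size in enumerate(grid_sizes):
            if idx < offset + size:
                return file_index, idx - offset
            offset += size
        raise IndexError("Index {} out of range.".format(idx))

    grid_sizes = [1000, 1000, 500]              # three snapshots of different sizes
    print(get_file_index(0, grid_sizes))        # (0, 0)
    print(get_file_index(1700, grid_sizes))     # (1, 700)
    print(get_file_index(2400, grid_sizes))     # (2, 400)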
diff --git a/_modules/mala/datahandling/lazy_load_dataset_single.html b/_modules/mala/datahandling/lazy_load_dataset_single.html
index 7201b770c..88eab43e0 100644
--- a/_modules/mala/datahandling/lazy_load_dataset_single.html
+++ b/_modules/mala/datahandling/lazy_load_dataset_single.html
@@ -75,15 +75,16 @@
Source code for mala.datahandling.lazy_load_dataset_single
"""DataSet for lazy-loading."""
+
import os
from multiprocessing import shared_memory
import numpy as np
import torch
-from torch.utils.data import Dataset, DataLoader
+from torch.utils.data import Dataset
-[docs]class LazyLoadDatasetSingle(torch.utils.data.Dataset):
+[docs]class LazyLoadDatasetSingle(Dataset):
"""
DataSet class for lazy loading.
@@ -121,10 +122,19 @@ Source code for mala.datahandling.lazy_load_dataset_single
If True, then the gradient is stored for the inputs.
"""
- def __init__(self, batch_size, snapshot, input_dimension, output_dimension,
- input_data_scaler, output_data_scaler, descriptor_calculator,
- target_calculator, use_horovod,
- input_requires_grad=False):
+ def __init__(
+ self,
+ batch_size,
+ snapshot,
+ input_dimension,
+ output_dimension,
+ input_data_scaler,
+ output_data_scaler,
+ descriptor_calculator,
+ target_calculator,
+ use_horovod,
+ input_requires_grad=False,
+ ):
self.snapshot = snapshot
self.input_dimension = input_dimension
self.output_dimension = output_dimension
@@ -134,8 +144,9 @@ Source code for mala.datahandling.lazy_load_dataset_single
self.target_calculator = target_calculator
self.number_of_snapshots = 0
self.total_size = 0
- self.descriptors_contain_xyz = self.descriptor_calculator.\
- descriptors_contain_xyz
+ self.descriptors_contain_xyz = (
+ self.descriptor_calculator.descriptors_contain_xyz
+ )
self.currently_loaded_file = None
self.input_data = np.empty(0)
self.output_data = np.empty(0)
@@ -159,25 +170,45 @@ Source code for mala.datahandling.lazy_load_dataset_single
"""
# Get array shape and data types
if self.snapshot.snapshot_type == "numpy":
- self.input_shape, self.input_dtype = self.descriptor_calculator. \
- read_dimensions_from_numpy_file(
- os.path.join(self.snapshot.input_npy_directory,
- self.snapshot.input_npy_file), read_dtype=True)
-
- self.output_shape, self.output_dtype = self.target_calculator. \
- read_dimensions_from_numpy_file(
- os.path.join(self.snapshot.output_npy_directory,
- self.snapshot.output_npy_file), read_dtype=True)
+ self.input_shape, self.input_dtype = (
+ self.descriptor_calculator.read_dimensions_from_numpy_file(
+ os.path.join(
+ self.snapshot.input_npy_directory,
+ self.snapshot.input_npy_file,
+ ),
+ read_dtype=True,
+ )
+ )
+
+ self.output_shape, self.output_dtype = (
+ self.target_calculator.read_dimensions_from_numpy_file(
+ os.path.join(
+ self.snapshot.output_npy_directory,
+ self.snapshot.output_npy_file,
+ ),
+ read_dtype=True,
+ )
+ )
elif self.snapshot.snapshot_type == "openpmd":
- self.input_shape, self.input_dtype = self.descriptor_calculator. \
- read_dimensions_from_openpmd_file(
- os.path.join(self.snapshot.input_npy_directory,
- self.snapshot.input_npy_file), read_dtype=True)
-
- self.output_shape, self.output_dtype = self.target_calculator. \
- read_dimensions_from_openpmd_file(
- os.path.join(self.snapshot.output_npy_directory,
- self.snapshot.output_npy_file), read_dtype=True)
+ self.input_shape, self.input_dtype = (
+ self.descriptor_calculator.read_dimensions_from_openpmd_file(
+ os.path.join(
+ self.snapshot.input_npy_directory,
+ self.snapshot.input_npy_file,
+ ),
+ read_dtype=True,
+ )
+ )
+
+ self.output_shape, self.output_dtype = (
+ self.target_calculator.read_dimensions_from_openpmd_file(
+ os.path.join(
+ self.snapshot.output_npy_directory,
+ self.snapshot.output_npy_file,
+ ),
+ read_dtype=True,
+ )
+ )
else:
raise Exception("Invalid snapshot type selected.")
@@ -185,8 +216,9 @@ Source code for mala.datahandling.lazy_load_dataset_single
# usage to data in FP32 type (which is a good idea anyway to save
# memory)
if self.input_dtype != np.float32 or self.output_dtype != np.float32:
- raise Exception("LazyLoadDatasetSingle requires numpy data in "
- "FP32.")
+ raise Exception(
+ "LazyLoadDatasetSingle requires numpy data in FP32."
+ )
# Allocate shared memory buffer
input_bytes = self.input_dtype.itemsize * np.prod(self.input_shape)
@@ -240,16 +272,22 @@ Source code for mala.datahandling.lazy_load_dataset_single
input_shm = shared_memory.SharedMemory(name=self.input_shm_name)
output_shm = shared_memory.SharedMemory(name=self.output_shm_name)
- input_data = np.ndarray(shape=[self.snapshot.grid_size,
- self.input_dimension],
- dtype=np.float32, buffer=input_shm.buf)
- output_data = np.ndarray(shape=[self.snapshot.grid_size,
- self.output_dimension],
- dtype=np.float32, buffer=output_shm.buf)
- if idx == self.len-1:
- batch = self.indices[idx * self.batch_size:]
+ input_data = np.ndarray(
+ shape=[self.snapshot.grid_size, self.input_dimension],
+ dtype=np.float32,
+ buffer=input_shm.buf,
+ )
+ output_data = np.ndarray(
+ shape=[self.snapshot.grid_size, self.output_dimension],
+ dtype=np.float32,
+ buffer=output_shm.buf,
+ )
+ if idx == self.len - 1:
+ batch = self.indices[idx * self.batch_size :]
else:
- batch = self.indices[idx*self.batch_size:(idx+1)*self.batch_size]
+ batch = self.indices[
+ idx * self.batch_size : (idx + 1) * self.batch_size
+ ]
# print(batch.shape)
input_batch = input_data[batch, ...]
@@ -296,7 +334,6 @@ Source code for mala.datahandling.lazy_load_dataset_single
single dataset object is used back to back.
"""
np.random.shuffle(self.indices)
-
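LazyLoadDatasetSingle above stores each snapshot in a multiprocessing shared-memory block and only wraps it in numpy arrays inside __getitem__, so a separate loader process can fill the buffer without copying data into the training process. A minimal sketch of that mechanism with illustrative sizes (not MALA's actual buffers):

    import numpy as np
    from multiprocessing import shared_memory

    grid_size, input_dimension = 1000, 91

    # Allocate a shared buffer large enough for one FP32 snapshot.
    nbytes = np.dtype(np.float32).itemsize * grid_size * input_dimension
    shm = shared_memory.SharedMemory(create=True, size=nbytes)

    # Writer side: view the buffer as an array and fill it.
    writer_view = np.ndarray(
        (grid_size, input_dimension), dtype=np.float32, buffer=shm.buf
    )
    writer_view[:] = np.random.rand(grid_size, input_dimension)

    # Reader side (e.g. another process attaching by name): same memory, no copy.
    attached = shared_memory.SharedMemory(name=shm.name)
    reader_view = np.ndarray(
        (grid_size, input_dimension), dtype=np.float32, buffer=attached.buf
    )
    print(np.allclose(writer_view, reader_view))   # True

    attached.close()
    shm.close()
    shm.unlink()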
diff --git a/_modules/mala/datahandling/multi_lazy_load_data_loader.html b/_modules/mala/datahandling/multi_lazy_load_data_loader.html
index e2b0060b0..b07bea729 100644
--- a/_modules/mala/datahandling/multi_lazy_load_data_loader.html
+++ b/_modules/mala/datahandling/multi_lazy_load_data_loader.html
@@ -75,6 +75,7 @@
Source code for mala.datahandling.multi_lazy_load_data_loader
"""Class for loading multiple data sets with pre-fetching."""
+
import os
import numpy as np
@@ -98,26 +99,27 @@ Source code for mala.datahandling.multi_lazy_load_data_loader
self.datasets = datasets
self.loaders = []
for d in datasets:
- self.loaders.append(DataLoader(d,
- batch_size=None,
- **kwargs,
- shuffle=False))
+ self.loaders.append(
+ DataLoader(d, batch_size=None, **kwargs, shuffle=False)
+ )
# Create single process pool for prefetching
# Can use ThreadPoolExecutor for debugging.
- #self.pool = concurrent.futures.ThreadPoolExecutor(1)
+ # self.pool = concurrent.futures.ThreadPoolExecutor(1)
self.pool = concurrent.futures.ProcessPoolExecutor(1)
# Allocate shared memory and commence file load for first
# dataset in list
dset = self.datasets[0]
dset.allocate_shared_mem()
- self.load_future = self.pool.submit(self.load_snapshot_to_shm,
- dset.snapshot,
- dset.descriptor_calculator,
- dset.target_calculator,
- dset.input_shm_name,
- dset.output_shm_name)
+ self.load_future = self.pool.submit(
+ self.load_snapshot_to_shm,
+ dset.snapshot,
+ dset.descriptor_calculator,
+ dset.target_calculator,
+ dset.input_shm_name,
+ dset.output_shm_name,
+ )
def __len__(self):
"""
@@ -169,13 +171,15 @@ Source code for mala.datahandling.multi_lazy_load_data_loader
# Prefetch next file (looping around epoch boundary)
dset = self.datasets[self.count % len(self.loaders)]
if not dset.loaded:
- dset.allocate_shared_mem()
- self.load_future = self.pool.submit(self.load_snapshot_to_shm,
- dset.snapshot,
- dset.descriptor_calculator,
- dset.target_calculator,
- dset.input_shm_name,
- dset.output_shm_name)
+ dset.allocate_shared_mem()
+ self.load_future = self.pool.submit(
+ self.load_snapshot_to_shm,
+ dset.snapshot,
+ dset.descriptor_calculator,
+ dset.target_calculator,
+ dset.input_shm_name,
+ dset.output_shm_name,
+ )
# Return current
return self.loaders[self.count - 1]
@@ -193,8 +197,13 @@ Source code for mala.datahandling.multi_lazy_load_data_loader
# Worker function to load data into shared memory (limited to numpy files
# only for now)
[docs] @staticmethod
- def load_snapshot_to_shm(snapshot, descriptor_calculator, target_calculator,
- input_shm_name, output_shm_name):
+ def load_snapshot_to_shm(
+ snapshot,
+ descriptor_calculator,
+ target_calculator,
+ input_shm_name,
+ output_shm_name,
+ ):
"""
Load a snapshot into shared memory.
@@ -222,61 +231,85 @@ Source code for mala.datahandling.multi_lazy_load_data_loader
output_shm = shared_memory.SharedMemory(name=output_shm_name)
if snapshot.snapshot_type == "numpy":
- input_shape, input_dtype = descriptor_calculator. \
- read_dimensions_from_numpy_file(
- os.path.join(snapshot.input_npy_directory,
- snapshot.input_npy_file), read_dtype=True)
-
- output_shape, output_dtype = target_calculator. \
- read_dimensions_from_numpy_file(
- os.path.join(snapshot.output_npy_directory,
- snapshot.output_npy_file), read_dtype=True)
+ input_shape, input_dtype = (
+ descriptor_calculator.read_dimensions_from_numpy_file(
+ os.path.join(
+ snapshot.input_npy_directory, snapshot.input_npy_file
+ ),
+ read_dtype=True,
+ )
+ )
+
+ output_shape, output_dtype = (
+ target_calculator.read_dimensions_from_numpy_file(
+ os.path.join(
+ snapshot.output_npy_directory,
+ snapshot.output_npy_file,
+ ),
+ read_dtype=True,
+ )
+ )
elif snapshot.snapshot_type == "openpmd":
- input_shape, input_dtype = descriptor_calculator. \
- read_dimensions_from_openpmd_file(
- os.path.join(snapshot.input_npy_directory,
- snapshot.input_npy_file), read_dtype=True)
-
- output_shape, output_dtype = target_calculator. \
- read_dimensions_from_openpmd_file(
- os.path.join(snapshot.output_npy_directory,
- snapshot.output_npy_file), read_dtype=True)
+ input_shape, input_dtype = (
+ descriptor_calculator.read_dimensions_from_openpmd_file(
+ os.path.join(
+ snapshot.input_npy_directory, snapshot.input_npy_file
+ ),
+ read_dtype=True,
+ )
+ )
+
+ output_shape, output_dtype = (
+ target_calculator.read_dimensions_from_openpmd_file(
+ os.path.join(
+ snapshot.output_npy_directory,
+ snapshot.output_npy_file,
+ ),
+ read_dtype=True,
+ )
+ )
else:
raise Exception("Invalid snapshot type selected.")
# Form numpy arrays from shm buffers
- input_data = np.ndarray(shape=input_shape, dtype=input_dtype,
- buffer=input_shm.buf)
- output_data = np.ndarray(shape=output_shape, dtype=output_dtype,
- buffer=output_shm.buf)
+ input_data = np.ndarray(
+ shape=input_shape, dtype=input_dtype, buffer=input_shm.buf
+ )
+ output_data = np.ndarray(
+ shape=output_shape, dtype=output_dtype, buffer=output_shm.buf
+ )
# Load numpy data into shm buffers
if snapshot.snapshot_type == "numpy":
- descriptor_calculator. \
- read_from_numpy_file(
- os.path.join(snapshot.input_npy_directory,
- snapshot.input_npy_file),
+ descriptor_calculator.read_from_numpy_file(
+ os.path.join(
+ snapshot.input_npy_directory, snapshot.input_npy_file
+ ),
units=snapshot.input_units,
- array=input_data)
- target_calculator. \
- read_from_numpy_file(
- os.path.join(snapshot.output_npy_directory,
- snapshot.output_npy_file),
+ array=input_data,
+ )
+ target_calculator.read_from_numpy_file(
+ os.path.join(
+ snapshot.output_npy_directory, snapshot.output_npy_file
+ ),
units=snapshot.output_units,
- array=output_data)
- else :
- descriptor_calculator. \
- read_from_openpmd_file(
- os.path.join(snapshot.input_npy_directory,
- snapshot.input_npy_file),
+ array=output_data,
+ )
+ else:
+ descriptor_calculator.read_from_openpmd_file(
+ os.path.join(
+ snapshot.input_npy_directory, snapshot.input_npy_file
+ ),
units=snapshot.input_units,
- array=input_data)
- target_calculator. \
- read_from_openpmd_file(
- os.path.join(snapshot.output_npy_directory,
- snapshot.output_npy_file),
+ array=input_data,
+ )
+ target_calculator.read_from_openpmd_file(
+ os.path.join(
+ snapshot.output_npy_directory, snapshot.output_npy_file
+ ),
units=snapshot.output_units,
- array=output_data)
+ array=output_data,
+ )
# This function only loads the numpy data with scaling. Remaining data
# preprocessing occurs in __getitem__ of LazyLoadDatasetSingle
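The loader above hides file I/O behind training by submitting the load of the next snapshot to a single-worker ProcessPoolExecutor while the current snapshot is consumed. A condensed sketch of that prefetch pattern, with a dummy load function standing in for the shared-memory loading (names and timings are illustrative):

    import concurrent.futures
    import time

    def load_snapshot(name):
        """Stand-in for loading one snapshot into shared memory."""
        time.sleep(0.1)                       # pretend this is slow file I/O
        return "data of " + name

    if __name__ == "__main__":                # needed for process pools with spawn
        snapshots = ["snapshot0", "snapshot1", "snapshot2"]
        with concurrent.futures.ProcessPoolExecutor(1) as pool:
            future = pool.submit(load_snapshot, snapshots[0])   # prefetch first
            for i in range(len(snapshots)):
                data = future.result()        # wait for the prefetched snapshot
                if i + 1 < len(snapshots):
                    # Start loading the next one while this one is consumed.
                    future = pool.submit(load_snapshot, snapshots[i + 1])
                print("training on", data)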
diff --git a/_modules/mala/datahandling/snapshot.html b/_modules/mala/datahandling/snapshot.html
index e4ff830fe..289202789 100644
--- a/_modules/mala/datahandling/snapshot.html
+++ b/_modules/mala/datahandling/snapshot.html
@@ -75,9 +75,6 @@
Source code for mala.datahandling.snapshot
"""Represents an entire atomic snapshot (including descriptor/target data)."""
-from os.path import join
-
-import numpy as np
from mala.common.json_serializable import JSONSerializable
@@ -126,12 +123,18 @@ Source code for mala.datahandling.snapshot
Default is None.
"""
- def __init__(self, input_npy_file, input_npy_directory,
- output_npy_file, output_npy_directory,
- snapshot_function,
- input_units="", output_units="",
- calculation_output="",
- snapshot_type="openpmd"):
+ def __init__(
+ self,
+ input_npy_file,
+ input_npy_directory,
+ output_npy_file,
+ output_npy_directory,
+ snapshot_function,
+ input_units="",
+ output_units="",
+ calculation_output="",
+ snapshot_type="openpmd",
+ ):
super(Snapshot, self).__init__()
# Inputs.
@@ -177,12 +180,14 @@ Source code for mala.datahandling.snapshot
The object as read from the JSON file.
"""
- deserialized_object = cls(json_dict["input_npy_file"],
- json_dict["input_npy_directory"],
- json_dict["output_npy_file"],
- json_dict["output_npy_directory"],
- json_dict["snapshot_function"],
- json_dict["snapshot_type"])
+ deserialized_object = cls(
+ json_dict["input_npy_file"],
+ json_dict["input_npy_directory"],
+ json_dict["output_npy_file"],
+ json_dict["output_npy_directory"],
+ json_dict["snapshot_function"],
+ json_dict["snapshot_type"],
+ )
for key in json_dict:
setattr(deserialized_object, key, json_dict[key])
return deserialized_object
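The from_json classmethod above follows a construct-then-setattr pattern: build the object from the required constructor arguments, then copy every remaining key from the dictionary back onto the instance so optional attributes survive a save/load round trip. A generic sketch of the pattern with a hypothetical class (not MALA's Snapshot):

    class JsonRestorable:
        """Construct from required fields, then restore all remaining keys."""

        def __init__(self, input_npy_file, input_npy_directory):
            self.input_npy_file = input_npy_file
            self.input_npy_directory = input_npy_directory
            self.calculation_output = ""      # optional attribute with a default

        @classmethod
        def from_json(cls, json_dict):
            obj = cls(json_dict["input_npy_file"], json_dict["input_npy_directory"])
            # Extra keys (including optional attributes) overwrite the defaults.
            for key in json_dict:
                setattr(obj, key, json_dict[key])
            return obj

    restored = JsonRestorable.from_json(
        {
            "input_npy_file": "snap.in.npy",
            "input_npy_directory": "/tmp",
            "calculation_output": "dft.out",
        }
    )
    print(restored.calculation_output)        # dft.out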
diff --git a/_modules/mala/descriptors/atomic_density.html b/_modules/mala/descriptors/atomic_density.html
index de3e79b06..1b1e30805 100644
--- a/_modules/mala/descriptors/atomic_density.html
+++ b/_modules/mala/descriptors/atomic_density.html
@@ -75,20 +75,12 @@
Source code for mala.descriptors.atomic_density
"""Gaussian descriptor class."""
+
import os
import ase
import ase.io
-try:
- from lammps import lammps
- # For version compatibility; older lammps versions (the serial version
- # we still use on some machines) do not have these constants.
- try:
- from lammps import constants as lammps_constants
- except ImportError:
- pass
-except ModuleNotFoundError:
- pass
+from importlib.util import find_spec
import numpy as np
from scipy.spatial import distance
@@ -192,24 +184,30 @@ Source code for mala.descriptors.atomic_density
optimal_sigma : float
The optimal sigma value.
"""
- return (np.max(voxel) / reference_grid_spacing_aluminium) * \
- optimal_sigma_aluminium
+ return (
+ np.max(voxel) / reference_grid_spacing_aluminium
+ ) * optimal_sigma_aluminium
def _calculate(self, outdir, **kwargs):
if self.parameters._configuration["lammps"]:
- try:
- from lammps import lammps
- except ModuleNotFoundError:
- printout("No LAMMPS found for descriptor calculation, "
- "falling back to python.")
+ if find_spec("lammps") is None:
+ printout(
+ "No LAMMPS found for descriptor calculation, "
+ "falling back to python."
+ )
return self.__calculate_python(**kwargs)
-
- return self.__calculate_lammps(outdir, **kwargs)
+ else:
+ return self.__calculate_lammps(outdir, **kwargs)
else:
return self.__calculate_python(**kwargs)
def __calculate_lammps(self, outdir, **kwargs):
"""Perform actual Gaussian descriptor calculation."""
+ # For version compatibility; older lammps versions (the serial version
+ # we still use on some machines) have these constants as part of the
+ # general LAMMPS import.
+ from lammps import constants as lammps_constants
+
use_fp64 = kwargs.get("use_fp64", False)
return_directly = kwargs.get("return_directly", False)
@@ -224,16 +222,23 @@ Source code for mala.descriptors.atomic_density
# Check if we have to determine the optimal sigma value.
if self.parameters.atomic_density_sigma is None:
self.grid_dimensions = [nx, ny, nz]
- self.parameters.atomic_density_sigma = self.\
- get_optimal_sigma(self.voxel)
+ self.parameters.atomic_density_sigma = self.get_optimal_sigma(
+ self.voxel
+ )
# Create LAMMPS instance.
lammps_dict = {}
lammps_dict["sigma"] = self.parameters.atomic_density_sigma
lammps_dict["rcutfac"] = self.parameters.atomic_density_cutoff
lammps_dict["atom_config_fname"] = ase_out_path
- lmp = self._setup_lammps(nx, ny, nz, outdir, lammps_dict,
- log_file_name="lammps_ggrid_log.tmp")
+ lmp = self._setup_lammps(
+ nx,
+ ny,
+ nz,
+ outdir,
+ lammps_dict,
+ log_file_name="lammps_ggrid_log.tmp",
+ )
# For now the file is chosen automatically, because this is used
# mostly under the hood anyway.
@@ -248,18 +253,27 @@ Source code for mala.descriptors.atomic_density
lmp.file(runfile)
# Extract the data.
- nrows_ggrid = extract_compute_np(lmp, "ggrid",
- lammps_constants.LMP_STYLE_LOCAL,
- lammps_constants.LMP_SIZE_ROWS)
- ncols_ggrid = extract_compute_np(lmp, "ggrid",
- lammps_constants.LMP_STYLE_LOCAL,
- lammps_constants.LMP_SIZE_COLS)
-
- gaussian_descriptors_np = \
- extract_compute_np(lmp, "ggrid",
- lammps_constants.LMP_STYLE_LOCAL, 2,
- array_shape=(nrows_ggrid, ncols_ggrid),
- use_fp64=use_fp64)
+ nrows_ggrid = extract_compute_np(
+ lmp,
+ "ggrid",
+ lammps_constants.LMP_STYLE_LOCAL,
+ lammps_constants.LMP_SIZE_ROWS,
+ )
+ ncols_ggrid = extract_compute_np(
+ lmp,
+ "ggrid",
+ lammps_constants.LMP_STYLE_LOCAL,
+ lammps_constants.LMP_SIZE_COLS,
+ )
+
+ gaussian_descriptors_np = extract_compute_np(
+ lmp,
+ "ggrid",
+ lammps_constants.LMP_STYLE_LOCAL,
+ 2,
+ array_shape=(nrows_ggrid, ncols_ggrid),
+ use_fp64=use_fp64,
+ )
lmp.close()
# In comparison to SNAP, the atomic density always returns
@@ -283,21 +297,23 @@ Source code for mala.descriptors.atomic_density
# Here, we want to do something else with the atomic density,
# and thus have to properly reorder it.
# We have to switch from x fastest to z fastest reordering.
- gaussian_descriptors_np = \
- gaussian_descriptors_np.reshape((self.grid_dimensions[2],
- self.grid_dimensions[1],
- self.grid_dimensions[0],
- 7))
- gaussian_descriptors_np = \
- gaussian_descriptors_np.transpose([2, 1, 0, 3])
+ gaussian_descriptors_np = gaussian_descriptors_np.reshape(
+ (
+ self.grid_dimensions[2],
+ self.grid_dimensions[1],
+ self.grid_dimensions[0],
+ 7,
+ )
+ )
+ gaussian_descriptors_np = gaussian_descriptors_np.transpose(
+ [2, 1, 0, 3]
+ )
if self.parameters.descriptors_contain_xyz:
self.fingerprint_length = 4
- return gaussian_descriptors_np[:, :, :, 3:], \
- nx*ny*nz
+ return gaussian_descriptors_np[:, :, :, 3:], nx * ny * nz
else:
self.fingerprint_length = 1
- return gaussian_descriptors_np[:, :, :, 6:], \
- nx*ny*nz
+ return gaussian_descriptors_np[:, :, :, 6:], nx * ny * nz
def __calculate_python(self, **kwargs):
"""
@@ -316,26 +332,42 @@ Source code for mala.descriptors.atomic_density
- It only works for ONE chemical element
- It has no MPI or GPU support
"""
- printout("Using python for descriptor calculation. "
- "The resulting calculation will be slow for "
- "large systems.")
-
- gaussian_descriptors_np = np.zeros((self.grid_dimensions[0],
- self.grid_dimensions[1],
- self.grid_dimensions[2], 4),
- dtype=np.float64)
+ printout(
+ "Using python for descriptor calculation. "
+ "The resulting calculation will be slow for "
+ "large systems."
+ )
+
+ gaussian_descriptors_np = np.zeros(
+ (
+ self.grid_dimensions[0],
+ self.grid_dimensions[1],
+ self.grid_dimensions[2],
+ 4,
+ ),
+ dtype=np.float64,
+ )
# Construct the hyperparameters to calculate the Gaussians.
# This follows the implementation in the LAMMPS code.
if self.parameters.atomic_density_sigma is None:
- self.parameters.atomic_density_sigma = self.\
- get_optimal_sigma(self.voxel)
- cutoff_squared = self.parameters.atomic_density_cutoff * \
+ self.parameters.atomic_density_sigma = self.get_optimal_sigma(
+ self.voxel
+ )
+ cutoff_squared = (
self.parameters.atomic_density_cutoff
- prefactor = 1.0 / (np.power(self.parameters.atomic_density_sigma *
- np.sqrt(2*np.pi),3))
- argumentfactor = 1.0 / (2.0 * self.parameters.atomic_density_sigma *
- self.parameters.atomic_density_sigma)
+ * self.parameters.atomic_density_cutoff
+ )
+ prefactor = 1.0 / (
+ np.power(
+ self.parameters.atomic_density_sigma * np.sqrt(2 * np.pi), 3
+ )
+ )
+ argumentfactor = 1.0 / (
+ 2.0
+ * self.parameters.atomic_density_sigma
+ * self.parameters.atomic_density_sigma
+ )
# Create a list of all potentially relevant atoms.
all_atoms = self._setup_atom_list()
@@ -351,25 +383,30 @@ Source code for mala.descriptors.atomic_density
for j in range(0, self.grid_dimensions[1]):
for k in range(0, self.grid_dimensions[2]):
# Compute the grid.
- gaussian_descriptors_np[i, j, k, 0:3] = \
+ gaussian_descriptors_np[i, j, k, 0:3] = (
self._grid_to_coord([i, j, k])
+ )
# Compute the Gaussian descriptors.
- dm = np.squeeze(distance.cdist(
- [gaussian_descriptors_np[i, j, k, 0:3]],
- all_atoms))
- dm = dm*dm
+ dm = np.squeeze(
+ distance.cdist(
+ [gaussian_descriptors_np[i, j, k, 0:3]], all_atoms
+ )
+ )
+ dm = dm * dm
dm_cutoff = dm[np.argwhere(dm < cutoff_squared)]
- gaussian_descriptors_np[i, j, k, 3] += \
- np.sum(prefactor*np.exp(-dm_cutoff*argumentfactor))
+ gaussian_descriptors_np[i, j, k, 3] += np.sum(
+ prefactor * np.exp(-dm_cutoff * argumentfactor)
+ )
if self.parameters.descriptors_contain_xyz:
self.fingerprint_length = 4
return gaussian_descriptors_np, np.prod(self.grid_dimensions)
else:
self.fingerprint_length = 1
- return gaussian_descriptors_np[:, :, :, 3:], \
- np.prod(self.grid_dimensions)
+ return gaussian_descriptors_np[:, :, :, 3:], np.prod(
+ self.grid_dimensions
+ )
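The pure-Python path above evaluates, at every grid point, a sum of normalized Gaussians over all atoms inside the cutoff, with prefactor 1/(sigma*sqrt(2*pi))**3 and exponent -d**2/(2*sigma**2), matching the prefactor and argumentfactor set up in the code. A self-contained sketch for a single grid point with toy positions (values are illustrative; single element, no MPI):

    import numpy as np
    from scipy.spatial import distance

    sigma = 0.5                               # atomic_density_sigma
    cutoff = 4.0                              # atomic_density_cutoff
    prefactor = 1.0 / (sigma * np.sqrt(2.0 * np.pi)) ** 3
    argumentfactor = 1.0 / (2.0 * sigma * sigma)
    cutoff_squared = cutoff * cutoff

    atoms = np.array([[0.0, 0.0, 0.0], [2.0, 0.0, 0.0], [10.0, 0.0, 0.0]])
    grid_point = np.array([1.0, 0.0, 0.0])

    # Squared distances to all atoms; keep only those inside the cutoff.
    dm = np.squeeze(distance.cdist([grid_point], atoms)) ** 2
    dm_cutoff = dm[dm < cutoff_squared]

    # Gaussian atomic density at this grid point (third atom lies outside).
    density = np.sum(prefactor * np.exp(-dm_cutoff * argumentfactor))
    print(density)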
diff --git a/_modules/mala/descriptors/bispectrum.html b/_modules/mala/descriptors/bispectrum.html
index 3de9ba2a0..8de4820ae 100644
--- a/_modules/mala/descriptors/bispectrum.html
+++ b/_modules/mala/descriptors/bispectrum.html
@@ -75,20 +75,13 @@
Source code for mala.descriptors.bispectrum
"""Bispectrum descriptor class."""
+
import os
import ase
import ase.io
-try:
- from lammps import lammps
- # For version compatibility; older lammps versions (the serial version
- # we still use on some machines) do not have these constants.
- try:
- from lammps import constants as lammps_constants
- except ImportError:
- pass
-except ModuleNotFoundError:
- pass
+
+from importlib.util import find_spec
import numpy as np
from scipy.spatial import distance
@@ -196,16 +189,15 @@ Source code for mala.descriptors.bispectrum
raise Exception("Unsupported unit for bispectrum descriptors.")
def _calculate(self, outdir, **kwargs):
-
if self.parameters._configuration["lammps"]:
- try:
- from lammps import lammps
- except ModuleNotFoundError:
- printout("No LAMMPS found for descriptor calculation, "
- "falling back to python.")
+ if find_spec("lammps") is None:
+ printout(
+ "No LAMMPS found for descriptor calculation, "
+ "falling back to python."
+ )
return self.__calculate_python(**kwargs)
-
- return self.__calculate_lammps(outdir, **kwargs)
+ else:
+ return self.__calculate_lammps(outdir, **kwargs)
else:
return self.__calculate_python(**kwargs)
@@ -216,6 +208,11 @@ Source code for mala.descriptors.bispectrum
Creates a LAMMPS instance with appropriate call parameters and uses
it for the calculation.
"""
+ # For version compatibility; older lammps versions (the serial version
+ # we still use on some machines) have these constants as part of the
+ # general LAMMPS import.
+ from lammps import constants as lammps_constants
+
use_fp64 = kwargs.get("use_fp64", False)
lammps_format = "lammps-data"
@@ -227,11 +224,19 @@ Source code for mala.descriptors.bispectrum
nz = self.grid_dimensions[2]
# Create LAMMPS instance.
- lammps_dict = {"twojmax": self.parameters.bispectrum_twojmax,
- "rcutfac": self.parameters.bispectrum_cutoff,
- "atom_config_fname": ase_out_path}
- lmp = self._setup_lammps(nx, ny, nz, outdir, lammps_dict,
- log_file_name="lammps_bgrid_log.tmp")
+ lammps_dict = {
+ "twojmax": self.parameters.bispectrum_twojmax,
+ "rcutfac": self.parameters.bispectrum_cutoff,
+ "atom_config_fname": ase_out_path,
+ }
+ lmp = self._setup_lammps(
+ nx,
+ ny,
+ nz,
+ outdir,
+ lammps_dict,
+ log_file_name="lammps_bgrid_log.tmp",
+ )
# An empty string means that the user wants to use the standard input.
# What that is differs depending on serial/parallel execution.
@@ -239,15 +244,17 @@ Source code for mala.descriptors.bispectrum
filepath = __file__.split("bispectrum")[0]
if self.parameters._configuration["mpi"]:
if self.parameters.use_z_splitting:
- self.parameters.lammps_compute_file = \
- os.path.join(filepath, "in.bgridlocal.python")
+ self.parameters.lammps_compute_file = os.path.join(
+ filepath, "in.bgridlocal.python"
+ )
else:
- self.parameters.lammps_compute_file = \
- os.path.join(filepath,
- "in.bgridlocal_defaultproc.python")
+ self.parameters.lammps_compute_file = os.path.join(
+ filepath, "in.bgridlocal_defaultproc.python"
+ )
else:
- self.parameters.lammps_compute_file = \
- os.path.join(filepath, "in.bgrid.python")
+ self.parameters.lammps_compute_file = os.path.join(
+ filepath, "in.bgrid.python"
+ )
# Do the LAMMPS calculation.
lmp.file(self.parameters.lammps_compute_file)
@@ -257,11 +264,13 @@ Source code for mala.descriptors.bispectrum
ncols0 = 3
# Analytical relation for fingerprint length
- ncoeff = (self.parameters.bispectrum_twojmax + 2) * \
- (self.parameters.bispectrum_twojmax + 3) * \
- (self.parameters.bispectrum_twojmax + 4)
- ncoeff = ncoeff // 24 # integer division
- self.fingerprint_length = ncols0+ncoeff
+ ncoeff = (
+ (self.parameters.bispectrum_twojmax + 2)
+ * (self.parameters.bispectrum_twojmax + 3)
+ * (self.parameters.bispectrum_twojmax + 4)
+ )
+ ncoeff = ncoeff // 24 # integer division
+ self.fingerprint_length = ncols0 + ncoeff
# Extract data from LAMMPS calculation.
# This is different for the parallel and the serial case.
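The analytical relation above fixes the number of bispectrum components purely from bispectrum_twojmax: ncoeff = (twojmax+2)(twojmax+3)(twojmax+4) // 24, plus the three coordinate columns. A quick illustrative check of that formula for a few values of twojmax:

    def bispectrum_fingerprint_length(twojmax, ncols0=3):
        """Number of bispectrum coefficients plus the x/y/z columns."""
        ncoeff = (twojmax + 2) * (twojmax + 3) * (twojmax + 4) // 24
        return ncols0 + ncoeff

    for twojmax in (2, 6, 10):
        print(twojmax, bispectrum_fingerprint_length(twojmax))
    # 2 -> 8, 6 -> 33, 10 -> 94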
@@ -269,20 +278,29 @@ Source code for mala.descriptors.bispectrum
# the end of this function.
# This is not necessarily true for the parallel case.
if self.parameters._configuration["mpi"]:
- nrows_local = extract_compute_np(lmp, "bgridlocal",
- lammps_constants.LMP_STYLE_LOCAL,
- lammps_constants.LMP_SIZE_ROWS)
- ncols_local = extract_compute_np(lmp, "bgridlocal",
- lammps_constants.LMP_STYLE_LOCAL,
- lammps_constants.LMP_SIZE_COLS)
+ nrows_local = extract_compute_np(
+ lmp,
+ "bgridlocal",
+ lammps_constants.LMP_STYLE_LOCAL,
+ lammps_constants.LMP_SIZE_ROWS,
+ )
+ ncols_local = extract_compute_np(
+ lmp,
+ "bgridlocal",
+ lammps_constants.LMP_STYLE_LOCAL,
+ lammps_constants.LMP_SIZE_COLS,
+ )
if ncols_local != self.fingerprint_length + 3:
raise Exception("Inconsistent number of features.")
- snap_descriptors_np = \
- extract_compute_np(lmp, "bgridlocal",
- lammps_constants.LMP_STYLE_LOCAL, 2,
- array_shape=(nrows_local, ncols_local),
- use_fp64=use_fp64)
+ snap_descriptors_np = extract_compute_np(
+ lmp,
+ "bgridlocal",
+ lammps_constants.LMP_STYLE_LOCAL,
+ 2,
+ array_shape=(nrows_local, ncols_local),
+ use_fp64=use_fp64,
+ )
lmp.close()
# Copy the grid dimensions only at the end.
@@ -291,10 +309,14 @@ Source code for mala.descriptors.bispectrum
else:
# Extract data from LAMMPS calculation.
- snap_descriptors_np = \
- extract_compute_np(lmp, "bgrid", 0, 2,
- (nz, ny, nx, self.fingerprint_length),
- use_fp64=use_fp64)
+ snap_descriptors_np = extract_compute_np(
+ lmp,
+ "bgrid",
+ 0,
+ 2,
+ (nz, ny, nx, self.fingerprint_length),
+ use_fp64=use_fp64,
+ )
lmp.close()
# switch from x-fastest to z-fastest order (swaps 0th and 2nd
@@ -303,9 +325,9 @@ Source code for mala.descriptors.bispectrum
# Copy the grid dimensions only at the end.
self.grid_dimensions = [nx, ny, nz]
if self.parameters.descriptors_contain_xyz:
- return snap_descriptors_np, nx*ny*nz
+ return snap_descriptors_np, nx * ny * nz
else:
- return snap_descriptors_np[:, :, :, 3:], nx*ny*nz
+ return snap_descriptors_np[:, :, :, 3:], nx * ny * nz
def __calculate_python(self, **kwargs):
"""
@@ -329,14 +351,17 @@ Source code for mala.descriptors.bispectrum
hard codes them. Compared to the LAMMPS implementation, some
essentially never used options are not maintained/optimized.
"""
- printout("Using python for descriptor calculation. "
- "The resulting calculation will be slow for "
- "large systems.")
+ printout(
+ "Using python for descriptor calculation. "
+ "The resulting calculation will be slow for "
+ "large systems."
+ )
# The entire bispectrum calculation may be extensively profiled.
profile_calculation = kwargs.get("profile_calculation", False)
if profile_calculation:
import time
+
timing_distances = 0
timing_ui = 0
timing_zi = 0
@@ -344,16 +369,22 @@ Source code for mala.descriptors.bispectrum
timing_gridpoints = 0
# Set up the array holding the bispectrum descriptors.
- ncoeff = (self.parameters.bispectrum_twojmax + 2) * \
- (self.parameters.bispectrum_twojmax + 3) * \
- (self.parameters.bispectrum_twojmax + 4)
- ncoeff = ncoeff // 24 # integer division
+ ncoeff = (
+ (self.parameters.bispectrum_twojmax + 2)
+ * (self.parameters.bispectrum_twojmax + 3)
+ * (self.parameters.bispectrum_twojmax + 4)
+ )
+ ncoeff = ncoeff // 24 # integer division
self.fingerprint_length = ncoeff + 3
- bispectrum_np = np.zeros((self.grid_dimensions[0],
- self.grid_dimensions[1],
- self.grid_dimensions[2],
- self.fingerprint_length),
- dtype=np.float64)
+ bispectrum_np = np.zeros(
+ (
+ self.grid_dimensions[0],
+ self.grid_dimensions[1],
+ self.grid_dimensions[2],
+ self.fingerprint_length,
+ ),
+ dtype=np.float64,
+ )
# Create a list of all potentially relevant atoms.
all_atoms = self._setup_atom_list()
@@ -424,8 +455,9 @@ Source code for mala.descriptors.bispectrum
# Compute the grid point.
if profile_calculation:
t_grid = time.time()
- bispectrum_np[x, y, z, 0:3] = \
- self._grid_to_coord([x, y, z])
+ bispectrum_np[x, y, z, 0:3] = self._grid_to_coord(
+ [x, y, z]
+ )
########
# Distance matrix calculation.
@@ -436,15 +468,30 @@ Source code for mala.descriptors.bispectrum
if profile_calculation:
t0 = time.time()
- distances = np.squeeze(distance.cdist(
- [bispectrum_np[x, y, z, 0:3]],
- all_atoms))
- distances_cutoff = np.squeeze(np.abs(
- distances[np.argwhere(
- distances < self.parameters.bispectrum_cutoff)]))
- atoms_cutoff = np.squeeze(all_atoms[np.argwhere(
- distances < self.parameters.bispectrum_cutoff), :],
- axis=1)
+ distances = np.squeeze(
+ distance.cdist(
+ [bispectrum_np[x, y, z, 0:3]], all_atoms
+ )
+ )
+ distances_cutoff = np.squeeze(
+ np.abs(
+ distances[
+ np.argwhere(
+ distances
+ < self.parameters.bispectrum_cutoff
+ )
+ ]
+ )
+ )
+ atoms_cutoff = np.squeeze(
+ all_atoms[
+ np.argwhere(
+ distances < self.parameters.bispectrum_cutoff
+ ),
+ :,
+ ],
+ axis=1,
+ )
nr_atoms = np.shape(atoms_cutoff)[0]
if profile_calculation:
timing_distances += time.time() - t0
@@ -458,10 +505,12 @@ Source code for mala.descriptors.bispectrum
if profile_calculation:
t0 = time.time()
- ulisttot_r, ulisttot_i = \
- self.__compute_ui(nr_atoms, atoms_cutoff,
- distances_cutoff,
- bispectrum_np[x, y, z, 0:3])
+ ulisttot_r, ulisttot_i = self.__compute_ui(
+ nr_atoms,
+ atoms_cutoff,
+ distances_cutoff,
+ bispectrum_np[x, y, z, 0:3],
+ )
if profile_calculation:
timing_ui += time.time() - t0
@@ -474,8 +523,9 @@ Source code for mala.descriptors.bispectrum
if profile_calculation:
t0 = time.time()
- zlist_r, zlist_i = \
- self.__compute_zi(ulisttot_r, ulisttot_i)
+ zlist_r, zlist_i = self.__compute_zi(
+ ulisttot_r, ulisttot_i
+ )
if profile_calculation:
timing_zi += time.time() - t0
@@ -487,9 +537,9 @@ Source code for mala.descriptors.bispectrum
########
if profile_calculation:
t0 = time.time()
- bispectrum_np[x, y, z, 3:] = \
- self.__compute_bi(ulisttot_r, ulisttot_i, zlist_r,
- zlist_i)
+ bispectrum_np[x, y, z, 3:] = self.__compute_bi(
+ ulisttot_r, ulisttot_i, zlist_r, zlist_i
+ )
if profile_calculation:
timing_gridpoints += time.time() - t_grid
timing_bi += time.time() - t0
@@ -499,13 +549,27 @@ Source code for mala.descriptors.bispectrum
print("Python-based bispectrum descriptor calculation timing: ")
print("Index matrix initialization [s]", timing_index_init)
print("Overall calculation time [s]", timing_total)
- print("Calculation time per gridpoint [s/gridpoint]",
- timing_gridpoints / np.prod(self.grid_dimensions))
+ print(
+ "Calculation time per gridpoint [s/gridpoint]",
+ timing_gridpoints / np.prod(self.grid_dimensions),
+ )
print("Timing contributions per gridpoint: ")
- print("Distance matrix [s/gridpoint]", timing_distances/np.prod(self.grid_dimensions))
- print("Compute ui [s/gridpoint]", timing_ui/np.prod(self.grid_dimensions))
- print("Compute zi [s/gridpoint]", timing_zi/np.prod(self.grid_dimensions))
- print("Compute bi [s/gridpoint]", timing_bi/np.prod(self.grid_dimensions))
+ print(
+ "Distance matrix [s/gridpoint]",
+ timing_distances / np.prod(self.grid_dimensions),
+ )
+ print(
+ "Compute ui [s/gridpoint]",
+ timing_ui / np.prod(self.grid_dimensions),
+ )
+ print(
+ "Compute zi [s/gridpoint]",
+ timing_zi / np.prod(self.grid_dimensions),
+ )
+ print(
+ "Compute bi [s/gridpoint]",
+ timing_bi / np.prod(self.grid_dimensions),
+ )
if self.parameters.descriptors_contain_xyz:
return bispectrum_np, np.prod(self.grid_dimensions)
@@ -558,9 +622,12 @@ Source code for mala.descriptors.bispectrum
def deltacg(j1, j2, j):
sfaccg = np.math.factorial((j1 + j2 + j) // 2 + 1)
- return np.sqrt(np.math.factorial((j1 + j2 - j) // 2) *
- np.math.factorial((j1 - j2 + j) // 2) *
- np.math.factorial((-j1 + j2 + j) // 2) / sfaccg)
+ return np.sqrt(
+ np.math.factorial((j1 + j2 - j) // 2)
+ * np.math.factorial((j1 - j2 + j) // 2)
+ * np.math.factorial((-j1 + j2 + j) // 2)
+ / sfaccg
+ )
########
# Indices for compute_ui.
@@ -576,23 +643,40 @@ Source code for mala.descriptors.bispectrum
idxu_count += 1
self.__index_u_max = idxu_count
- rootpqarray = np.zeros((self.parameters.bispectrum_twojmax + 2,
- self.parameters.bispectrum_twojmax + 2))
+ rootpqarray = np.zeros(
+ (
+ self.parameters.bispectrum_twojmax + 2,
+ self.parameters.bispectrum_twojmax + 2,
+ )
+ )
for p in range(1, self.parameters.bispectrum_twojmax + 1):
- for q in range(1,
- self.parameters.bispectrum_twojmax + 1):
+ for q in range(1, self.parameters.bispectrum_twojmax + 1):
rootpqarray[p, q] = np.sqrt(p / q)
# These are only for optimization purposes.
self.__index_u_one_initialized = None
for j in range(0, self.parameters.bispectrum_twojmax + 1):
- stop = self.__index_u_block[j + 1] if j < self.parameters.bispectrum_twojmax else self.__index_u_max
+ stop = (
+ self.__index_u_block[j + 1]
+ if j < self.parameters.bispectrum_twojmax
+ else self.__index_u_max
+ )
if self.__index_u_one_initialized is None:
- self.__index_u_one_initialized = np.arange(self.__index_u_block[j], stop=stop, step=j + 2)
+ self.__index_u_one_initialized = np.arange(
+ self.__index_u_block[j], stop=stop, step=j + 2
+ )
else:
- self.__index_u_one_initialized = np.concatenate((self.__index_u_one_initialized,
- np.arange(self.__index_u_block[j], stop=stop, step=j + 2)))
- self.__index_u_one_initialized = self.__index_u_one_initialized.astype(np.int32)
+ self.__index_u_one_initialized = np.concatenate(
+ (
+ self.__index_u_one_initialized,
+ np.arange(
+ self.__index_u_block[j], stop=stop, step=j + 2
+ ),
+ )
+ )
+ self.__index_u_one_initialized = self.__index_u_one_initialized.astype(
+ np.int32
+ )
self.__index_u_full = []
self.__index_u_symmetry_pos = []
self.__index_u_symmetry_neg = []
@@ -646,8 +730,11 @@ Source code for mala.descriptors.bispectrum
idxz_count = 0
for j1 in range(self.parameters.bispectrum_twojmax + 1):
for j2 in range(j1 + 1):
- for j in range(j1 - j2, min(self.parameters.bispectrum_twojmax,
- j1 + j2) + 1, 2):
+ for j in range(
+ j1 - j2,
+ min(self.parameters.bispectrum_twojmax, j1 + j2) + 1,
+ 2,
+ ):
for mb in range(j // 2 + 1):
for ma in range(j + 1):
idxz_count += 1
@@ -655,15 +742,22 @@ Source code for mala.descriptors.bispectrum