diff --git a/docs/2.0.x/_modules/dscribe/descriptors/descriptorlocal.html b/docs/2.0.x/_modules/dscribe/descriptors/descriptorlocal.html index fd5c9fea..2492c341 100644 --- a/docs/2.0.x/_modules/dscribe/descriptors/descriptorlocal.html +++ b/docs/2.0.x/_modules/dscribe/descriptors/descriptorlocal.html @@ -276,7 +276,10 @@
import ase.geometry.cell
import ase.data
+from dscribe.utils.species import get_atomic_numbers
from dscribe.descriptors.descriptorlocal import DescriptorLocal
import dscribe.ext
@@ -141,8 +142,8 @@ Source code for dscribe.descriptors.soap
sigma=1.0,
rbf="gto",
weighting=None,
- crossover=True,
average="off",
+ compression={"mode": "off", "species_weighting": None},
species=None,
periodic=False,
sparse=False,
@@ -216,19 +217,42 @@ Source code for dscribe.descriptors.soap
function is also specified, this constant will override it
for the central atoms.
- crossover (bool): Determines if crossover of atomic types should
- be included in the power spectrum. If enabled, the power
- spectrum is calculated over all unique species combinations Z
- and Z'. If disabled, the power spectrum does not contain
- cross-species information and is only run over each unique
- species Z. Turned on by default to correspond to the original
- definition
average (str): The averaging mode over the centers of interest.
Valid options are:
* ``"off"``: No averaging.
* ``"inner"``: Averaging over sites before summing up the magnetic quantum numbers: :math:`p_{nn'l}^{Z_1,Z_2} \sim \sum_m (\\frac{1}{n} \sum_i c_{nlm}^{i, Z_1})^{*} (\\frac{1}{n} \sum_i c_{n'lm}^{i, Z_2})`
* ``"outer"``: Averaging over the power spectrum of different sites: :math:`p_{nn'l}^{Z_1,Z_2} \sim \\frac{1}{n} \sum_i \sum_m (c_{nlm}^{i, Z_1})^{*} (c_{n'lm}^{i, Z_2})`
+ compression (dict): Contains the options which specify the feature compression to apply.
+ Applying compression can slightly reduce the accuracy of models trained on the feature
+ representation but can also dramatically reduce the size of the feature vector
+ and hence the computational cost. Options are:
+
+ * ``"mode"``: Specifies the type of compression. This can be one of:
+ * ``"off"``: No compression; default.
+ * ``"mu2"``: The SOAP feature vector is generated in an element-agnostic way, so that
+ the size of the feature vector is now independent of the number of elements (see Darby et al
+ below for details). It is still possible when using this option to construct a feature
+ vector that distinguishes between elements by supplying element-specific weighting under
+ "species_weighting", see below.
+ * ``"mu1nu1"``: Implements the mu=1, nu=1 feature compression scheme from Darby et al.: :math:`p_{inn'l}^{Z_1,Z_2} \sim \sum_m (c_{nlm}^{i, Z_1})^{*} (\sum_z c_{n'lm}^{i, z})`.
+ In other words, each coefficient for each species is multiplied by a "species-mu2" sum over the corresponding set of coefficients for all other species.
+ If this option is selected, features are generated for each center, but the number of features (the size of each feature vector) scales linearly rather than
+ quadratically with the number of elements in the system.
+ * ``"crossover"``: The power spectrum does not contain cross-species information
+ and is only run over each unique species Z. In this configuration, the size of
+ the feature vector scales linearly with the number of elements in the system.
+ * ``"species_weighting"``: Either None or a dictionary mapping each species to a
+ species-specific weight. If None, there is no species-specific weighting. If a dictionary,
+ must contain a matching key for each species in the ``species`` iterable.
+ The main use of species weighting is to weight each element differently when using
+ the "mu2" option for ``compression``.
+
+ For reference see:
+ "Darby, J.P., Kermode, J.R. & Csányi, G.
+ Compressing local atomic neighbourhood descriptors.
+ npj Comput Mater 8, 166 (2022). https://doi.org/10.1038/s41524-022-00847-y"
+
species (iterable): The chemical species as a list of atomic
numbers or as a list of chemical symbols. Notice that this is not
the atomic numbers that are present for an individual system, but
@@ -258,6 +282,13 @@ Source code for dscribe.descriptors.soap
# Setup the involved chemical species
self.species = species
+ # If species weighting is supplied, ensure it is valid and set
+ # it up.
+ if "species_weighting" in compression:
+ self.species_weights = compression["species_weighting"]
+ else:
+ self.species_weights = None
+
# Test that general settings are valid
if sigma <= 0:
raise ValueError(
@@ -283,6 +314,15 @@ Source code for dscribe.descriptors.soap
"one of the following: {}".format(average, supported_average)
)
+ supported_compression = set(("off", "mu2", "mu1nu1", "crossover"))
+ if compression["mode"] not in supported_compression:
+ raise ValueError(
+ "Invalid compression mode '{}' given. Please use "
+ "one of the following: {}".format(
+ compression["mode"], supported_compression
+ )
+ )
+
if not (weighting or r_cut):
raise ValueError("Either weighting or r_cut need to be defined")
if weighting:
@@ -358,7 +398,7 @@ Source code for dscribe.descriptors.soap
self._l_max = l_max
self._rbf = rbf
self.average = average
- self.crossover = crossover
+ self.compression = compression["mode"]
[docs] def prepare_centers(self, system, centers=None):
"""Validates and prepares the centers for the C++ extension."""
@@ -503,7 +543,7 @@ Source code for dscribe.descriptors.soap
# Determine if the outputs have a fixed size
n_features = self.get_number_of_features()
static_size = None
- if self.average == "outer" or self.average == "inner":
+ if self.average != "off":
static_size = [n_features]
else:
if centers is None:
@@ -581,13 +621,14 @@ Source code for dscribe.descriptors.soap
self._l_max,
self._eta,
self._weighting,
- self.crossover,
self.average,
cutoff_padding,
alphas,
betas,
self._atomic_numbers,
+ self.species_weights,
self.periodic,
+ self.compression,
)
# Calculate analytically with extension
@@ -612,13 +653,14 @@ Source code for dscribe.descriptors.soap
self._l_max,
self._eta,
self._weighting,
- self.crossover,
self.average,
cutoff_padding,
rx,
gss,
self._atomic_numbers,
+ self.species_weights,
self.periodic,
+ self.compression,
)
soap_poly.create(
soap_mat,
@@ -723,15 +765,15 @@ Source code for dscribe.descriptors.soap
self._l_max,
self._eta,
self._weighting,
- self.crossover,
self.average,
cutoff_padding,
alphas,
betas,
self._atomic_numbers,
+ self.species_weights,
self.periodic,
+ self.compression,
)
-
# Calculate numerically with extension
soap_gto.derivatives_numerical(
d,
@@ -757,13 +799,14 @@ Source code for dscribe.descriptors.soap
self._l_max,
self._eta,
self._weighting,
- self.crossover,
self.average,
cutoff_padding,
rx,
gss,
self._atomic_numbers,
+ self.species_weights,
self.periodic,
+ self.compression,
)
soap_poly.derivatives_numerical(
d,
@@ -826,13 +869,14 @@ Source code for dscribe.descriptors.soap
self._l_max,
self._eta,
self._weighting,
- self.crossover,
self.average,
cutoff_padding,
alphas,
betas,
self._atomic_numbers,
+ self.species_weights,
self.periodic,
+ self.compression,
)
# These arrays are only used internally by the C++ code.
@@ -892,6 +936,60 @@ Source code for dscribe.descriptors.soap
self.index_to_atomic_number[i_atom] = atomic_number
self.n_elements = len(self._atomic_numbers)
+ @property
+ def species_weights(self):
+ return self._species_weights
+
+ @species_weights.setter
+ def species_weights(self, value):
+ """Used to check the validity of species weighting and set it up.
+ Note that species must already be set up in order to set species
+ weighting.
+
+ Args:
+ value (dict or None): Dictionary mapping each species (atomic number
+ or chemical symbol) to its weight, or None to use a weight of one for every species.
+ """
+ if value is None:
+ self._species_weights = np.ones((self.n_elements))
+ else:
+ if not isinstance(value, dict):
+ raise ValueError(
+ "Invalid species weighting '{}' given. Species weighting must "
+ "be either None or a dict.".format(value)
+ )
+
+ if len(value) != self.n_elements:
+ raise ValueError(
+ "The species_weighting dictionary, "
+ "if supplied, must contain the same keys as "
+ "the list of accepted species."
+ )
+ species_weights = []
+ for specie in list(self.species):
+ if specie not in value:
+ raise ValueError(
+ "The species_weighting dictionary, "
+ "if supplied, must contain the same keys as "
+ "the list of accepted species."
+ )
+ if isinstance(specie, (int, np.integer)):
+ if specie <= 0:
+ raise ValueError(
+ "Species weighting {} contained a zero or negative "
+ "atomic number.".format(value)
+ )
+ species_weights.append((value[specie], specie))
+ else:
+ species_weights.append(
+ (value[specie], ase.data.atomic_numbers.get(specie))
+ )
+
+ species_weights = [
+ s[0] for s in sorted(species_weights, key=lambda x: x[1])
+ ]
+ self._species_weights = np.array(species_weights).astype(np.float64)
+
[docs] def get_number_of_features(self):
"""Used to inquire the final number of features that this descriptor
will have.
@@ -900,11 +998,15 @@ Source code for dscribe.descriptors.soap
int: Number of features for this descriptor.
"""
n_elem = len(self._atomic_numbers)
- if self.crossover:
- n_elem_radial = n_elem * self._n_max
- return int((n_elem_radial) * (n_elem_radial + 1) / 2 * (self._l_max + 1))
- else:
- return int(n_elem * self._n_max * (self._n_max + 1) / 2 * (self._l_max + 1))
+ if self.compression == "mu2":
+ return int((self._n_max) * (self._n_max + 1) * (self._l_max + 1) / 2)
+ elif self.compression == "mu1nu1":
+ return int(self._n_max**2 * n_elem * (self._l_max + 1))
+
+ elif self.compression == "crossover":
+ return int(n_elem * self._n_max * (self._n_max + 1) / 2 * (self._l_max + 1))
+ n_elem_radial = n_elem * self._n_max
+ return int((n_elem_radial) * (n_elem_radial + 1) / 2 * (self._l_max + 1))
[docs] def get_location(self, species):
"""Can be used to query the location of a species combination in the
@@ -952,9 +1054,8 @@ Source code for dscribe.descriptors.soap
numbers = list(reversed(numbers))
i = numbers[0]
j = numbers[1]
- n_elem_feat_symm = self._n_max * (self._n_max + 1) / 2 * (self._l_max + 1)
-
- if self.crossover:
+ if self.compression == "off":
+ n_elem_feat_symm = self._n_max * (self._n_max + 1) / 2 * (self._l_max + 1)
n_elem_feat_unsymm = self._n_max * self._n_max * (self._l_max + 1)
n_elem_feat = n_elem_feat_symm if i == j else n_elem_feat_unsymm
@@ -965,10 +1066,20 @@ Source code for dscribe.descriptors.soap
start = int(m_symm * n_elem_feat_symm + m_unsymm * n_elem_feat_unsymm)
end = int(start + n_elem_feat)
- else:
+ elif self.compression == "mu2":
+ n_elem_feat_symm = self._n_max * (self._n_max + 1) * (self._l_max + 1) / 2
+ start = 0
+ end = int(0 + n_elem_feat_symm)
+ elif self.compression in ["mu1nu1", "crossover"]:
+ n_elem_feat_symm = self._n_max**2 * (self._l_max + 1)
+ if self.compression == "crossover":
+ n_elem_feat_symm = (
+ self._n_max * (self._n_max + 1) * (self._l_max + 1) / 2
+ )
if i != j:
raise ValueError(
- "Crossover is set to False. No cross-species output " "available"
+ "Compression has been selected. "
+ "No cross-species output available"
)
start = int(i * n_elem_feat_symm)
end = int(start + n_elem_feat_symm)
@@ -1218,4 +1329,4 @@ Source code for dscribe.descriptors.soap