From 74f2d8c42bad8b312f83b9ff01033b8fd2dc0fd7 Mon Sep 17 00:00:00 2001 From: JosePizarro3 Date: Wed, 17 Jan 2024 15:26:36 +0100 Subject: [PATCH] Moved schema definition from the central nomad --- simulationdataschema/__init__.py | 2 +- simulationdataschema/general.py | 316 ++++--------- simulationdataschema/system.py | 740 +++++++++++++++++++++++++++++++ 3 files changed, 823 insertions(+), 235 deletions(-) create mode 100644 simulationdataschema/system.py diff --git a/simulationdataschema/__init__.py b/simulationdataschema/__init__.py index caf955d4..7761db82 100644 --- a/simulationdataschema/__init__.py +++ b/simulationdataschema/__init__.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from .general import SimulationWorkflow, SimulationWorkflowMethod, SimulationWorkflowResults +from .general import Computation \ No newline at end of file diff --git a/simulationdataschema/general.py b/simulationdataschema/general.py index 8be564af..c0d062e3 100644 --- a/simulationdataschema/general.py +++ b/simulationdataschema/general.py @@ -15,240 +15,88 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from typing import List +# +# Copyright The NOMAD Authors. +# +# This file is part of NOMAD. +# See https://nomad-lab.eu for further info. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# from nomad.metainfo import Environment +# from .run import Run +# from .calculation import Calculation +# from .method import Method +# from .system import System +# from . import run +# from . import method +# from . import calculation +# from . import system + +# m_env = Environment() +# m_env.m_add_sub_section(Environment.packages, run.m_package) +# m_env.m_add_sub_section(Environment.packages, method.m_package) +# m_env.m_add_sub_section(Environment.packages, calculation.m_package) +# m_env.m_add_sub_section(Environment.packages, system.m_package) import numpy as np -from nomad.datamodel.data import ArchiveSection -from nomad.metainfo import SubSection, Section, Quantity, Reference -from nomad.datamodel.metainfo.common import FastAccess -from nomad.datamodel.metainfo.workflow import Workflow, Link, Task -from nomad.datamodel.metainfo.simulation.method import Method -from nomad.datamodel.metainfo.simulation.system import System -from nomad.datamodel.metainfo.simulation.calculation import Calculation - - -def resolve_difference(values): - delta_values = None - - for n in range(-1, -len(values), -1): - a = values[n] - b = values[n - 1] - if a is None or b is None: - continue - delta_values = abs(a - b) - if delta_values != 0.0: - break - - return delta_values - - -INPUT_SYSTEM_NAME = 'Input system' -OUTPUT_SYSTEM_NAME = 'Output system' -INPUT_METHOD_NAME = 'Input method' -INPUT_CALCULATION_NAME = 'Input calculation' -OUTPUT_CALCULATION_NAME = 'Output calculation' -WORKFLOW_METHOD_NAME = 'Workflow method' -WORKFLOW_RESULTS_NAME = 'Workflow results' - - -class SimulationWorkflowMethod(ArchiveSection): - - m_def = Section(validate=False) - - def normalize(self, archive, logger): - pass - - -class SimulationWorkflowResults(ArchiveSection): - - m_def = Section(validate=False) - - calculation_result_ref = Quantity( - type=Reference(Calculation.m_def), - shape=[], - description=''' - Reference to calculation result. In the case of serial workflows, this corresponds - to the final step in the simulation. For the parallel case, it refers to the reference calculation. - ''', - categories=[FastAccess]) - - n_calculations = Quantity( - type=int, - shape=[], - description=''' - Number of calculations in workflow. - ''') - - calculations_ref = Quantity( - type=Reference(Calculation.m_def), - shape=['n_calculations'], - description=''' - List of references to each calculation section in the simulation. - ''') - - def normalize(self, archive, logger): - calculations = [] - try: - calculations = archive.run[0].calculation - except Exception: - return - - if not calculations: - return - - if not self.calculation_result_ref: - self.calculation_result_ref = calculations[-1] - - if not self.calculations_ref: - self.calculations_ref = calculations - - -class SimulationWorkflow(Workflow): - - method = SubSection(sub_section=SimulationWorkflowMethod) - - results = SubSection(sub_section=SimulationWorkflowResults, categories=[FastAccess]) - - def normalize(self, archive, logger): - super().normalize(archive, logger) - - self._calculations: List[Calculation] = [] - self._systems: List[System] = [] - self._methods: List[Method] = [] - try: - self._calculations = archive.run[-1].calculation - self._systems = archive.run[-1].system - self._methods = archive.run[-1].method - except Exception: - logger.warning('System, method and calculation required for normalization.') - pass - - if not self._calculations: +from nomad.datamodel import EntryArchive +from nomad.datamodel.data import EntryData +from nomad.datamodel.metainfo.basesections import Computation as BaseComputation +from nomad.metainfo import SubSection +from .system import ModelSystem + + +class Computation(BaseComputation, EntryData): + ''' + ''' + # m_def = Section(extends_base_section=True) + model_system = SubSection(sub_section=ModelSystem.m_def, repeats=True) + # method = SubSection( + # sub_section=Method.m_def, + # description=''' + # The input methodological parameters used for the computation. + # ''', + # repeats=True, + # ) + # calculation = SubSection( + # sub_section=Calculation.m_def, + # description=''' + # The output of a computation. It can reference a specific system and method section. + # ''', + # repeats=True, + # ) + + def _set_system_tree_index(self, system_parent: ModelSystem, tree_index: np.int32 = 0): + for system_child in system_parent.model_system: + system_child.tree_index = tree_index + 1 + self._set_system_tree_index(system_child, tree_index + 1) + + def normalize(self, archive: EntryArchive, logger) -> None: + super(EntryData, self).normalize(archive, logger) + + # Finding which is the representative system of a calculation: typically, we will + # define it as the last system reported (CHECK THIS!). + # TODO extend adding the proper representative system extraction using `normalizer.py` + if len(self.model_system) == 0: + logger.error('No system information reported.') return - - if not self.inputs: - if self._systems: - self.m_add_sub_section( - Workflow.inputs, Link(name=INPUT_SYSTEM_NAME, section=self._systems[0])) - - if self.method is not None: - self.m_add_sub_section( - Workflow.inputs, Link(name=WORKFLOW_METHOD_NAME, section=self.method)) - - for link in self.inputs: - if isinstance(link.section, System): - self.input_structure = link.section - break - - if not self.outputs: - if self._calculations: - self.m_add_sub_section( - Workflow.outputs, Link(name=OUTPUT_CALCULATION_NAME, section=self._calculations[-1])) - - if self.results is not None: - self.m_add_sub_section( - Workflow.outputs, Link(name=WORKFLOW_RESULTS_NAME, section=self.results)) - - if not self.tasks: - current_step = 1 - current_time = None - inputs = [] - add_inputs, add_outputs = False, False - tasks = [] - for n, calc in enumerate(self._calculations): - if calc.time_physical is None or calc.time_calculation is None: - # all calculation sections should have time info inorder to create workflow tasks - tasks = [] - break - - task = Task(outputs=[Link(name=OUTPUT_CALCULATION_NAME, section=calc)]) - # successive tasks in serial if overlap in time duration exists - if current_time is None: - current_time = calc.time_physical - start_time = calc.time_physical - calc.time_calculation - if start_time and (start_time > current_time or np.isclose(start_time, current_time)): - task.inputs = inputs - inputs = [] - current_step += 1 - else: - if tasks: - for input in tasks[-1].inputs: - if input.name == INPUT_METHOD_NAME: - continue - task.m_add_sub_section(Task.inputs, input) - if calc.method_ref: - task.m_add_sub_section(Task.inputs, Link(name=INPUT_METHOD_NAME, section=calc.method_ref)) - if calc.system_ref: - inputs.append(Link(name=INPUT_SYSTEM_NAME, section=calc.system_ref)) - task.m_add_sub_section(Task.outputs, Link(name=OUTPUT_SYSTEM_NAME, section=calc.system_ref)) - else: - inputs.append(Link(name=INPUT_CALCULATION_NAME, section=calc)) - task.name = f'Step {current_step}' - tasks.append(task) - current_time = max(current_time, calc.time_physical) - # add input if first calc in tasks - add_inputs = add_inputs or n == 0 - # add output if last calc in tasks - add_outputs = add_outputs or n == len(self._calculations) - 1 - for task in tasks: - # add workflow inputs to first parallel tasks - if task.name == 'Step 1' and add_inputs: - task.inputs.extend([input for input in self.inputs if input not in task.inputs]) - # add outputs of last parallel tasks to workflow outputs - if task.name == f'Step {current_step}' and add_outputs: - self.outputs.extend([output for output in task.outputs if output not in self.outputs]) - - self.tasks = tasks - - - -class ParallelSimulation(SimulationWorkflow): - def normalize(self, archive, logger): - super().normalize(archive, logger) - - if not self.tasks: - for n, calculation in enumerate(self._calculations): - inputs, outputs = [], [Link(name=OUTPUT_CALCULATION_NAME, section=calculation)] - if self._calculations[n].system_ref: - inputs.append(Link(name=INPUT_SYSTEM_NAME, section=self._calculations[n].system_ref)) - elif len(self._calculations) == len(self._systems): - inputs.append(Link(name=INPUT_SYSTEM_NAME, section=self._systems[n])) - else: - continue - if self._calculations[n].method_ref: - inputs.append(Link(name=INPUT_METHOD_NAME, section=self._calculations[n].method_ref)) - elif len(self._calculations) == len(self._methods): - inputs.append(Link(name=INPUT_METHOD_NAME, section=self._methods[n])) - elif len(self._methods) == 1: - inputs.append(Link(name=INPUT_METHOD_NAME, section=self._methods[0])) - self.tasks.append(Task(name=f'Calculation {n}', inputs=inputs, outputs=outputs)) - - -class SerialSimulation(SimulationWorkflow): - def normalize(self, archive, logger): - super().normalize(archive, logger) - - if not self.tasks: - previous_structure = None - for n, calculation in enumerate(self._calculations): - inputs, outputs = [], [Link(name=OUTPUT_CALCULATION_NAME, section=calculation)] - if calculation.system_ref: - input_structure = self.input_structure if n == 0 else self._calculations[n - 1].system_ref - if not input_structure: - input_structure = previous_structure - if input_structure: - inputs.append(Link(name=INPUT_SYSTEM_NAME, section=input_structure)) - previous_structure = calculation.system_ref - outputs.append(Link(name=OUTPUT_SYSTEM_NAME, section=calculation.system_ref)) - elif len(self._calculations) == len(self._systems): - inputs.append(Link(name=INPUT_SYSTEM_NAME, section=self.input_structure if n == 0 else self._systems[n - 1])) - outputs.append(Link(name=OUTPUT_SYSTEM_NAME, section=self._systems[n])) - else: - continue - if calculation.method_ref: - inputs.append(Link(name=INPUT_METHOD_NAME, section=calculation.method_ref)) - elif len(self._calculations) == len(self._methods): - inputs.append(Link(name=INPUT_METHOD_NAME, section=self._methods[n])) - elif len(self._methods) == 1: - inputs.append(Link(name=INPUT_METHOD_NAME, section=self._methods[0])) - self.tasks.append(Task(name=f'Step {n}', inputs=inputs, outputs=outputs)) + system_ref = self.model_system[-1] + system_ref.is_representative = True + + # Setting up the `tree_index` in the parent-child tree + for system_parents in self.model_system: + system_parents.tree_index = 0 + if len(system_parents.model_system) == 0: + continue + self._set_system_tree_index(system_parents) diff --git a/simulationdataschema/system.py b/simulationdataschema/system.py new file mode 100644 index 00000000..c78058e3 --- /dev/null +++ b/simulationdataschema/system.py @@ -0,0 +1,740 @@ +# +# Copyright The NOMAD Authors. +# +# This file is part of NOMAD. See https://nomad-lab.eu for further info. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# Copyright The NOMAD Authors. +# +# This file is part of NOMAD. +# See https://nomad-lab.eu for further info. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import re +import numpy as np +import ase + +from matid import SymmetryAnalyzer, Classifier # pylint: disable=import-error +from matid.classifications import Class0D, Atom, Class1D, Class2D, Material2D, Surface, Class3D # pylint: disable=import-error + +from nomad import config +from nomad.units import ureg +from nomad.atomutils import Formula, get_normalized_wyckoff, search_aflow_prototype + +from nomad.datamodel.data import ArchiveSection +from nomad.datamodel import EntryArchive +from nomad.metainfo import Quantity, SubSection, SectionProxy, MEnum +from nomad.datamodel.metainfo.basesections import System, RealSpace +from nomad.datamodel.metainfo.annotations import ELNAnnotation + + +class AtomicCell(RealSpace): + ''' + A base section used to specify the atomic cell quantities (labels, positions) of a system + at a given moment in time. + ''' + + name = Quantity( + type=MEnum('original', 'primitive', 'standard'), + description=''' + Name to identify the cell structure. It might be: + - 'original' as in orignally parsed, + - 'primitive' as the primitive unit cell, + - 'standard' as the standarized cell used for referencing. + ''', + ) + + n_atoms = Quantity( + type=np.int32, + description=''' + The total number of atoms in the system. + ''', + ) + + labels = Quantity( + type=str, + shape=['n_atoms'], + description=''' + List containing the labels of the atomic species in the system at the different positions + of the structure. It refers to a chemical element as defined in the periodic table, + e.g., 'H', 'O', 'Pt'. This quantity is equivalent to `atomic_numbers`. + ''', + ) + + atomic_numbers = Quantity( + type=np.int32, + shape=['n_atoms'], + description=''' + List of atomic numbers Z. This quantity is equivalent to `labels`. + ''', + ) + + positions = Quantity( + type=np.float64, + shape=['n_atoms', 3], + unit='meter', + description=''' + Positions of all the atoms in Cartesian coordinates. + ''', + ) + + lattice_vectors = Quantity( + type=np.float64, + shape=[3, 3], + unit='meter', + description=''' + Lattice vectors of the simulated cell in Cartesian coordinates. The first index runs + over each lattice vector. The second index runs over the $x, y, z$ Cartesian coordinates. + ''', + ) + + lattice_vectors_reciprocal = Quantity( + type=np.float64, + shape=[3, 3], + unit='1/meter', + description=''' + Reciprocal lattice vectors of the simulated cell, in Cartesian coordinates and + including the $2 pi$ pre-factor. The first index runs over each lattice vector. The + second index runs over the $x, y, z$ Cartesian coordinates. + ''', + ) + + periodic_boundary_conditions = Quantity( + type=bool, + shape=[3], + description=''' + If periodic boundary conditions are applied to each direction of the crystal axes. + ''', + ) + + velocities = Quantity( + type=np.float64, + shape=['n_atoms', 3], + unit='meter / second', + description=''' + Velocities of the atoms. It is the change in cartesian coordinates of the atom position + with time. + ''', + ) + + supercell_matrix = Quantity( + type=np.int32, + shape=[3, 3], + description=''' + Specifies the matrix that transforms the primitive unit cell into the supercell in + which the actual calculation is performed. In the easiest example, it is a diagonal + matrix whose elements multiply the `lattice_vectors`, e.g., [[3, 0, 0], [0, 3, 0], [0, 0, 3]] + is a $3 x 3 x 3$ superlattice. + ''', + ) + + equivalent_atoms = Quantity( + type=np.int32, + shape=['n_atoms'], + description=''' + List of equivalent atoms as defined in `labels`. If no equivalent atoms are found, + then the list is simply the index of each element, e.g.: + - [0, 1, 2, 3] all four atoms are non-equivalent. + - [0, 0, 0, 3] three equivalent atoms and one non-equivalent. + ''', + ) + + wyckoff_letters = Quantity( + type=str, + shape=['n_atoms'], + # TODO improve description + description=''' + Wyckoff letters associated with each atom position. + ''', + ) + + def normalize(self, archive: EntryArchive, logger) -> None: + # Check if AtomicCell section exists + if self is None: + logger.error('Could not find the basic System.atomic_cell information.') + + # Resolving atom_labels (either directly or from atomic_numbers) + atom_labels = self.labels + atomic_numbers = self.atomic_numbers + if atom_labels is None: + if atomic_numbers is None: + logger.error('System.atomic_cell has neither labels nor atomic_numbers defined.') + return + try: + atom_labels = [ase.data.chemical_symbols[number] for number in atomic_numbers] + except IndexError: + logger.error('System.atomic_cell has atomic_numbers that are out of range of the periodic table.') + return + self.labels = atom_labels + self.n_atoms = len(atom_labels) + + # Using ASE functionalities to write atomic_numbers + ase_atoms = ase.Atoms(symbols=atom_labels) + chemical_symbols = ase_atoms.get_chemical_symbols() + if atom_labels != list(chemical_symbols): + logger.warning('Chemical symbols in System.atomic_cell.labels are ambigous and cannot be ' + 'recognized by ASE.') + if atomic_numbers is None: + atomic_numbers = ase_atoms.get_atomic_numbers() + else: + if atomic_numbers != list(ase_atoms.get_atomic_numbers()): + logger.info('The parsed System.atomic_cell.atomic_numbers do not coincide with ' + 'the ASE extracted numbers from the labels. We will rewrite the parsed data.') + atomic_numbers = ase_atoms.get_atomic_numbers() + self.atomic_numbers = atomic_numbers + + # Periodic boundary conditions + pbc = self.periodic_boundary_conditions + if pbc is None: + pbc = [False, False, False] + logger.info('Could not find System.atomic_cell.periodic_boundary_conditions information. ' + 'Setting them to False.') + self.periodic_boundary_conditions = pbc + ase_atoms.set_pbc(pbc) + + # Atom positions + atom_positions = self.positions + if atom_positions is None or len(atom_positions) == 0: + logger.error('Could not find System.atomic_cell.positions.') + return + if len(atom_positions) != len(atom_labels): + logger.error('Length of System.atomic_cell.positions does not coincide with the length ' + 'of the System.atomic_cell.labels.') + return + ase_atoms.set_positions(atom_positions.to('angstrom').magnitude) + + # Lattice vectors and reciprocal lattice vectors + lattice_vectors = self.lattice_vectors + if lattice_vectors is None: + logger.info('Could not find System.atomic_cell.lattice_vectors.') + else: + ase_atoms.set_cell(lattice_vectors.to('angstrom').magnitude) + lattice_vectors_reciprocal = self.lattice_vectors_reciprocal + if lattice_vectors_reciprocal is None: + self.lattice_vectors_reciprocal = 2 * np.pi * ase_atoms.get_reciprocal_cell() / ureg.angstrom + + # Store temporarily the ase.Atoms object to use in the ModelSystem.normalizer() + self.m_cache['ase_atoms'] = ase_atoms + + def to_ase_atoms(self, nomad_atomic_cell) -> ase.Atoms: + ''' + Generates a ASE Atoms object with the most basic information from the parsed AtomicCell + section (labels, positions, and lattice_vectors). + ''' + ase_atoms = ase.Atoms(symbols=nomad_atomic_cell.labels) + ase_atoms.set_positions(nomad_atomic_cell.positions.to('angstrom').magnitude) + ase_atoms.set_cell(nomad_atomic_cell.lattice_vectors.to('angstrom').magnitude) + return ase_atoms + + +class Symmetry(ArchiveSection): + ''' + A base section used to specify the symmetry of the AtomicCell. This information can + be extracted via normalization using the MatID package, if `AtomicCell` is specified. + ''' + + bravais_lattice = Quantity( + type=str, + description=''' + Bravais lattice in Pearson notation. + + The first lowercase letter identifies the + crystal family: a (triclinic), b (monoclinic), o (orthorhombic), t (tetragonal), + h (hexagonal), c (cubic). + + The second uppercase letter identifies the centring: P (primitive), S (face centered), + I (body centred), R (rhombohedral centring), F (all faces centred). + ''', + ) + + hall_symbol = Quantity( + type=str, + description=''' + Hall symbol for this system describing the minimum number of symmetry operations, + in the form of Seitz matrices, needed to uniquely define a space group. See + https://cci.lbl.gov/sginfo/hall_symbols.html. Examples: + - `F -4 2 3`, + - `-P 4 2`, + - `-F 4 2 3`. + ''', + ) + + point_group_symbol = Quantity( + type=str, + description=''' + Symbol of the crystallographic point group in the Hermann-Mauguin notation. See + https://en.wikipedia.org/wiki/Crystallographic_point_group. Examples: + - `-43m`, + - `4/mmm`, + - `m-3m`. + ''', + ) + + space_group_number = Quantity( + type=np.int32, + description=''' + Specifies the International Union of Crystallography (IUC) space group number of the 3D + space group of this system. See https://en.wikipedia.org/wiki/List_of_space_groups. + Examples: + - `216`, + - `123`, + - `225`. + ''', + ) + + space_group_symbol = Quantity( + type=str, + description=''' + Specifies the International Union of Crystallography (IUC) space group symbol of the 3D + space group of this system. See https://en.wikipedia.org/wiki/List_of_space_groups. + Examples: + - `F-43m`, + - `P4/mmm`, + - `Fm-3m`. + ''', + ) + + strukturbericht_designation = Quantity( + type=str, + description=''' + Classification of the material according to the historically grown and similar crystal + structures ('strukturbericht'). Useful when using altogether with `space_group_symbol`. + Examples: + - `C1B`, `B3`, `C15b`, + - `L10`, `L60`, + - `L21`. + + Extracted from the AFLOW encyclopedia of crystallographic prototypes. + ''', + ) + + prototype_formula = Quantity( + type=str, + description=''' + The formula of the prototypical material for this structure as extracted from the + AFLOW encyclopedia of crystallographic prototypes. It is a string with the chemical + symbols: + - https://aflowlib.org/prototype-encyclopedia/chemical_symbols.html + ''', + ) + + prototype_aflow_id = Quantity( + type=str, + description=''' + The identifier of this structure in the AFLOW encyclopedia of crystallographic prototypes: + http://www.aflowlib.org/prototype-encyclopedia/index.html + ''', + ) + + atomic_cell_ref = Quantity( + type=AtomicCell, + description=''' + Reference to the AtomicCell section that the symmetry refers to. + ''', + a_eln=ELNAnnotation(component='ReferenceEditQuantity'), + ) + + +class ModelSystem(System): + ''' + Model system used as an input for the computation. It inherits from `System` where a set + of sub-sections for the `elemental_composition` is defined. We also define `name` to + refer to all the verbose and user-dependent naming in ModelSystem. + + It is composed of the sub-sections: AtomicCell (atomic structure quantities), Symmetry + (symmetry of the ModelSystem and which always references the 'standard' AtomicCell). + + This class nest over itself (with the proxy in `model_system`) to define different + parent-child system trees. + + The time evolution of the system is encoded on the fact that ModelSystem is a list under + computation, and for each element of that list, `time_step` can be defined. + + The normalization is ran in the following order: + 1. `AtomicCell.normalize()` from `atomic_cell`, + 2. `ModelSystem.normalize()` in this class. + + Examples: + + - Example 1, a crystal Si has: 3 AtomicCell sections (named 'original', 'primitive', + and 'standard'), 1 Symmetry, and 0 nested ModelSystem trees. + + - Example 2, an heterostructure Si/GaAs has: 1 parent ModelSystem (for Si/GaAs together) + and 2 nested child ModelSystem sections (for Si and GaAs); each child has 3 AtomicCell + sections and 1 Symmetry section. The parent ModelSystem could also have 3 AtomicCell + and 1 Symmetry section (if it is possible to extract them). + + - Example 3, a solution of C800H3200Cu has: 1 parent ModelSystem (for 800*(CH4)+Cu) + and 2 nested child ModelSystem sections (for CH4 and Cu); each child has 1 AtomicCell + section. + + - Example 4, a passivated surface GaAs-CO2 has --> similar to the example 2. + + - Example 5, a passivated heterostructure Si/(GaAs-CO2) has: 1 parent ModelSystem + (for Si/(GaAs-CO2)), 2 child ModelSystems (for Si and GaAs-CO2), and 2 additional + children in one of the childs (for GaAs and CO2). The number of AtomicCell and Symmetry + sections can be inferred using a combination of example 2 and 3. + ''' + + name = Quantity( + type=str, + description=''' + Any verbose naming refering to the ModelSystem. Can be left empty if it is a simple + crystal or it can be filled up. For example, an heterostructure of graphene (G) sandwiched + in between hexagonal boron nitrides (hBN) slabs could be named 'hBN/G/hBN'. + ''', + a_eln=ELNAnnotation(component='StringEditQuantity'), + ) + + # TODO work on improving and extending this quantity and the description + type = Quantity( + type=MEnum('atom', 'molecule / cluster', 'bulk', 'surface', '2D', '1D', 'active_atom', 'unavailable'), + description=''' + Type of the system (atom, bulk, surface, etc.) which is determined by the normalizer. + ''', + ) + + dimensionality = Quantity( + type=MEnum('0D', '1D', '2D', '3D', 'unavailable'), + description=''' + Dimensionality of the system. For atomistic systems this is automatically evaluated + by using the topology-scaling algorithm: + + https://doi.org/10.1103/PhysRevLett.118.106101. + + | Value | Description | + | --------- | ----------------------- | + | `'0D'` | Points in the space | + | `'1D'` | Periodi in one dimension | + | `'2D'` | Periodic in two dimensions | + | `'3D'` | Periodic in three dimensions | + ''', + ) + + time_step = Quantity( + type=np.int32, + description=''' + Specific time snapshot of the ModelSystem. The time evolution is then encoded + in a list of ModelSystems under Computation where for each element this quantity defines + the time step. + ''', + ) + + chemical_formula_descriptive = Quantity( + type=str, + description=''' + The chemical formula of the system as a string to be descriptive of the computation. + It is derived from `elemental_composition` if not specified, with non-reduced integer + numbers for the proportions of the elements. + ''', + ) + + chemical_formula_reduced = Quantity( + type=str, + description=''' + Alphabetically sorted chemical formula with reduced integer chemical proportion + numbers. The proportion number is omitted if it is 1. + ''', + ) + + chemical_formula_iupac = Quantity( + type=str, + description=''' + Chemical formula where the elements are ordered using a formal list based on + electronegativity as defined in the IUPAC nomenclature of inorganic chemistry (2005): + + - https://en.wikipedia.org/wiki/List_of_inorganic_compounds + + Contains reduced integer chemical proportion numbers where the proportion number + is omitted if it is 1. + ''', + ) + + chemical_formula_hill = Quantity( + type=str, + description=''' + Chemical formula where Carbon is placed first, then Hydrogen, and then all the other + elements in alphabetical order. If Carbon is not present, the order is alphabetical. + ''', + ) + + chemical_formula_anonymous = Quantity( + type=str, + description=''' + Formula with the elements ordered by their reduced integer chemical proportion + number, and the chemical species replaced by alphabetically ordered letters. The + proportion number is omitted if it is 1. + + Examples: H2O becomes A2B and H2O2 becomes AB. The letters are drawn from the English + alphabet that may be extended by increasing the number of letters: A, B, ..., Z, Aa, Ab + and so on. This definition is in line with the similarly named OPTIMADE definition. + ''', + ) + + atomic_cell = SubSection(sub_section=AtomicCell.m_def, repeats=True) + + symmetry = SubSection(sub_section=Symmetry.m_def, repeats=True) + + is_representative = Quantity( + type=bool, + default=False, + description=''' + If the model system section is the one representative of the computational simulation. + Defaults to False and set to True by the `Computation.normalize()`. If set to True, + the `ModelSystem.normalize()` function is ran (otherwise, it is not). + ''', + ) + + # TODO what about `branch_label`? + tree_label = Quantity( + type=str, + shape=[], + description=''' + Label of the specific branch in the system tree. + ''') + + # TODO what about `branch_index`? + tree_index = Quantity( + type=np.int32, + description=''' + Index refering to the depth of a branch in the system tree. + ''', + ) + + atom_indices = Quantity( + type=np.int32, + shape=['*'], + description=''' + Indices of the atoms in the child with respect to its parent. Example: + - We have SrTiO3, where `AtomicCell.labels = ['Sr', 'Ti', 'O', 'O', 'O']`. If + we create a `model_system` child for the `'Ti'` atom only, then in that child + `ModelSystem.model_system.atom_indices = [1]`. If now we want to refer both to + the `'Ti'` and the last `'O'` atoms, `ModelSystem.model_system.atom_indices = [1, 4]`. + ''', + ) + + bond_list = Quantity( + type=np.int32, + # TODO improve description and add an example using the case in atom_indices + description=''' + List of pairs of atom indices corresponding to bonds (e.g., as defined by a force field) + within this atoms_group. + ''', + ) + + model_system = SubSection(sub_section=SectionProxy('ModelSystem'), repeats=True) + + def _resolve_system_type_and_dimensionality(self, ase_atoms: ase.Atoms) -> str: + ''' + Determine the ModelSystem.type and ModelSystem.dimensionality using MatID classification analyzer: + + - https://singroup.github.io/matid/tutorials/classification.html + + Args: + ase.Atoms: The ASE Atoms structure to analyse + ''' + classification = None + system_type, dimensionality = self.type, self.dimensionality + if len(ase_atoms) <= config.normalize.system_classification_with_clusters_threshold: + try: + classifier = Classifier(radii="covalent", cluster_threshold=config.normalize.cluster_threshold) + cls = classifier.classify(ase_atoms) + except Exception as e: + self.logger.warning('MatID system classification failed.', exc_info=e, error=str(e)) + return system_type, dimensionality + + classification = type(cls) + if classification == Class3D: + system_type = 'bulk' + dimensionality = '3D' + elif classification == Atom: + system_type = 'atom' + dimensionality = '3D' + elif classification == Class0D: + system_type = 'molecule / cluster' + dimensionality = '0D' + elif classification == Class1D: + system_type = '1D' + dimensionality = '1D' + elif classification == Surface: + system_type = 'surface' + dimensionality = '2D' + elif classification == Material2D or classification == Class2D: + system_type = '2D' + dimensionality = '2D' + else: + self.logger.info("ModelSystem.type and dimensionality analysis not run due to large system size.") + + return system_type, dimensionality + + def _resolve_bulk_symmetry(self, ase_atoms: ase.Atoms) -> None: + ''' + Analyze the symmetry of the material being simulated using MatID and the parsed data + stored under ModelSystem and AtomicCell. Only available for bulk materials. + + Args: + ase.Atoms: The ASE Atoms structure to analyse + ''' + symmetry = {} + try: + symmetry_analyzer = SymmetryAnalyzer(ase_atoms, symmetry_tol=config.normalize.symmetry_tolerance) + + symmetry['bravais_lattice'] = symmetry_analyzer.get_bravais_lattice() + symmetry['hall_symbol'] = symmetry_analyzer.get_hall_symbol() + symmetry['point_group_symbol'] = symmetry_analyzer.get_point_group() + symmetry['space_group_number'] = symmetry_analyzer.get_space_group_number() + symmetry['space_group_symbol'] = symmetry_analyzer.get_space_group_international_short() + symmetry['origin_shift'] = symmetry_analyzer._get_spglib_origin_shift() + symmetry['transformation_matrix'] = symmetry_analyzer._get_spglib_transformation_matrix() + + # Originally parsed cell + original_wyckoff = symmetry_analyzer.get_wyckoff_letters_original() + original_equivalent_atoms = symmetry_analyzer.get_equivalent_atoms_original() + + # Primitive cell + primitive_wyckoff = symmetry_analyzer.get_wyckoff_letters_primitive() + primitive_equivalent_atoms = symmetry_analyzer.get_equivalent_atoms_primitive() + primitive_sys = symmetry_analyzer.get_primitive_system() + primitive_pos = primitive_sys.get_scaled_positions() + primitive_cell = primitive_sys.get_cell() + primitive_num = primitive_sys.get_atomic_numbers() + primitive_labels = primitive_sys.get_chemical_symbols() + + # Standarized (or conventional) cell + standard_wyckoff = symmetry_analyzer.get_wyckoff_letters_conventional() + standard_equivalent_atoms = symmetry_analyzer.get_equivalent_atoms_conventional() + standard_sys = symmetry_analyzer.get_conventional_system() + standard_pos = standard_sys.get_scaled_positions() + standard_cell = standard_sys.get_cell() + standard_num = standard_sys.get_atomic_numbers() + standard_labels = standard_sys.get_chemical_symbols() + except ValueError as e: + self.logger.debug('Symmetry analysis with MatID is not available.', details=str(e)) + return + except Exception as e: + self.logger.warning('Symmetry analysis with MatID failed.', exc_info=e) + return + + # Populating the originally parsed AtomicCell wyckoff_letters and equivalent_atoms information + sec_original_atoms = self.atomic_cell[0] + sec_original_atoms.wyckoff_letters = original_wyckoff + sec_original_atoms.equivalent_particles = original_equivalent_atoms + + # Populating the primitive AtomicCell information + sec_primitive_atoms = self.m_create(AtomicCell) + sec_primitive_atoms.name = 'primitive' + sec_primitive_atoms.lattice_vectors = primitive_cell * ureg.angstrom + sec_primitive_atoms.n_atoms = len(primitive_labels) + sec_primitive_atoms.positions = primitive_pos * ureg.angstrom + sec_primitive_atoms.labels = primitive_labels + sec_primitive_atoms.atomic_numbers = primitive_num + sec_primitive_atoms.wyckoff_letters = primitive_wyckoff + sec_primitive_atoms.equivalent_atoms = primitive_equivalent_atoms + + # Populating the standarized Atoms information + sec_standard_atoms = self.m_create(AtomicCell) + sec_standard_atoms.name = 'standard' + sec_standard_atoms.lattice_vectors = standard_cell * ureg.angstrom + sec_standard_atoms.n_atoms = len(standard_labels) + sec_standard_atoms.positions = standard_pos * ureg.angstrom + sec_standard_atoms.labels = standard_labels + sec_standard_atoms.atomic_numbers = standard_num + sec_standard_atoms.wyckoff_letters = standard_wyckoff + sec_standard_atoms.equivalent_atoms = standard_equivalent_atoms + + # Getting prototype_formula, prototype_aflow_id, and strukturbericht designation from + # standarized Wyckoff numbers and the space group number + if symmetry.get('space_group_number'): + norm_wyckoff = get_normalized_wyckoff(standard_num, standard_wyckoff) + aflow_prototype = search_aflow_prototype(symmetry.get('space_group_number'), norm_wyckoff) + strukturbericht = aflow_prototype.get('Strukturbericht Designation') + if strukturbericht == 'None': + strukturbericht = None + else: + strukturbericht = re.sub('[$_{}]', '', strukturbericht) + prototype_aflow_id = aflow_prototype.get('aflow_prototype_id') + prototype_formula = aflow_prototype.get('Prototype') + # Adding these to the symmetry dictionary for later assignement + symmetry['strukturbericht_designation'] = strukturbericht + symmetry['prototype_aflow_id'] = prototype_aflow_id + symmetry['prototype_formula'] = prototype_formula + + # Populating Symmetry section (try to reference the standarized cell, and if not, + # fallback to the originally parsed one) + sec_symmetry = self.m_create(Symmetry) + for key, val in sec_symmetry.m_def.all_quantities.items(): + sec_symmetry.m_set(val, symmetry.get(key)) + sec_symmetry.atomic_cell_ref = self.atomic_cell[-1] + + def normalize(self, archive: EntryArchive, logger) -> None: + super().normalize(archive, logger) + self.logger = logger + + # We don't need to normalize if the system is not representative + # self.is_representative = True + if not self.is_representative: + return + + # Extracting ASE Atoms object from the originally parsed AtomicCell section + if len(self.atomic_cell) == 0: + self.logger.warning('Could not find the originally parsed atomic system. ' + 'Symmetry and Formula extraction is thus not run.') + return + self.atomic_cell[0].name = 'original' + ase_atoms = self.atomic_cell[0].m_cache.get('ase_atoms') + if not ase_atoms: + return + + # Resolving system `type`, `dimensionality`, and Symmetry section (if this last one does not exists already) + original_atom_positions = self.atomic_cell[0].positions + if original_atom_positions is not None: + self.type = 'unavailable' if not self.type else self.type + self.dimensionality = 'unavailable' if not self.dimensionality else self.dimensionality + self.type, self.dimensionality = self._resolve_system_type_and_dimensionality(ase_atoms) + if self.type == 'bulk' and len(self.symmetry) == 0: + self._resolve_bulk_symmetry(ase_atoms) + # Extracting the cells parameters using the object Cell from ASE + for atom_cell in self.atomic_cell: + atoms = AtomicCell().to_ase_atoms(atom_cell) + cell = atoms.get_cell() + atom_cell.a, atom_cell.b, atom_cell.c = cell.lengths() * ureg.angstrom + atom_cell.alpha, atom_cell.beta, atom_cell.gamma = cell.angles() * ureg.degree + atom_cell.volume = cell.volume * ureg.angstrom ** 3 + + # Formulas + # TODO add support for fractional formulas (possibly add `AtomicCell.concentrations` for each species) + try: + formula = Formula(ase_atoms.get_chemical_formula()) + self.chemical_composition = ase_atoms.get_chemical_formula(mode='all') + self.chemical_formula_descriptive = formula.format('descriptive') + self.chemical_formula_reduced = formula.format('reduced') + self.chemical_formula_iupac = formula.format('iupac') + self.chemical_formula_hill = formula.format('hill') + self.chemical_formula_anonymous = formula.format('anonymous') + self.elemental_composition = formula.elemental_composition() + except ValueError as e: + self.logger.warning('Could not extract the chemical formulas information.', exc_info=e, error=str(e))