From a40af98963e2f271f3ca4024e43091ed545c63a6 Mon Sep 17 00:00:00 2001 From: Killian Date: Thu, 21 Mar 2024 10:04:57 -0400 Subject: [PATCH] PTMMotifIdentifier --- chemicalmotifidentifier/__init__.py | 5 +- .../experimental/__init__.py | 0 .../experimental/dataset.py | 393 ++++++++++++++++++ .../experimental/frameworks_base.py | 48 +++ .../_frameworks.py | 34 ++ .../fcc_ternary_system.ipynb | 9 +- 6 files changed, 486 insertions(+), 3 deletions(-) create mode 100644 chemicalmotifidentifier/experimental/__init__.py create mode 100644 chemicalmotifidentifier/experimental/dataset.py create mode 100644 chemicalmotifidentifier/experimental/frameworks_base.py diff --git a/chemicalmotifidentifier/__init__.py b/chemicalmotifidentifier/__init__.py index 2640d58..8a14f91 100644 --- a/chemicalmotifidentifier/__init__.py +++ b/chemicalmotifidentifier/__init__.py @@ -1,6 +1,7 @@ from chemicalmotifidentifier._src.correlations import * from chemicalmotifidentifier._src.dataset import * from chemicalmotifidentifier._src.dissimilarity import * + # from chemicalmotifidentifier._src.frameworks import * from chemicalmotifidentifier._src.frameworks_base import * from chemicalmotifidentifier._src.ml import * @@ -9,5 +10,5 @@ from chemicalmotifidentifier._src.transforms import * from chemicalmotifidentifier._src.wc import * -# from .experimental.dataset import * -# from .experimental.frameworks import * +from .experimental.dataset import * +from .experimental.frameworks_base import * diff --git a/chemicalmotifidentifier/experimental/__init__.py b/chemicalmotifidentifier/experimental/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chemicalmotifidentifier/experimental/dataset.py b/chemicalmotifidentifier/experimental/dataset.py new file mode 100644 index 0000000..52b5a9d --- /dev/null +++ b/chemicalmotifidentifier/experimental/dataset.py @@ -0,0 +1,393 @@ +import numpy as np +import torch +from ovito.data import NearestNeighborFinder +from ovito.io import import_file +from ovito.modifiers import ( + CalculateDisplacementsModifier, + ExpressionSelectionModifier, + PolyhedralTemplateMatchingModifier, +) +from torch_geometric.data import Data, InMemoryDataset +from tqdm import tqdm + +###################### +# Ovito data defect +###################### + + +ROOT_2_OVER_2 = np.sqrt(2) / 2 +FCC = [ + [ROOT_2_OVER_2, ROOT_2_OVER_2, 0], + [0, ROOT_2_OVER_2, ROOT_2_OVER_2], + [ROOT_2_OVER_2, 0, ROOT_2_OVER_2], + [-ROOT_2_OVER_2, -ROOT_2_OVER_2, 0], + [0, -ROOT_2_OVER_2, -ROOT_2_OVER_2], + [-ROOT_2_OVER_2, 0, -ROOT_2_OVER_2], + [-ROOT_2_OVER_2, ROOT_2_OVER_2, 0], + [0, -ROOT_2_OVER_2, ROOT_2_OVER_2], + [-ROOT_2_OVER_2, 0, ROOT_2_OVER_2], + [ROOT_2_OVER_2, -ROOT_2_OVER_2, 0], + [0, ROOT_2_OVER_2, -ROOT_2_OVER_2], + [ROOT_2_OVER_2, 0, -ROOT_2_OVER_2], +] +ROOT_3_OVER_2 = np.sqrt(3) / 2 +ROOT_6_OVER_3 = np.sqrt(6) / 3 +ROOT_3_OVER_6 = np.sqrt(3) / 6 +ROOT_3_OVER_3 = np.sqrt(3) / 3 +HCP1 = [ + [0.5, -ROOT_3_OVER_2, 0], + [-1, 0, 0], + [-0.5, ROOT_3_OVER_6, -ROOT_6_OVER_3], + [0.5, ROOT_3_OVER_6, -ROOT_6_OVER_3], + [0, -ROOT_3_OVER_3, -ROOT_6_OVER_3], + [-0.5, ROOT_3_OVER_2, 0], + [0.5, ROOT_3_OVER_2, 0], + [1, 0, 0], + [-0.5, -ROOT_3_OVER_2, 0], + [0, -ROOT_3_OVER_3, ROOT_6_OVER_3], + [0.5, ROOT_3_OVER_6, ROOT_6_OVER_3], + [-0.5, ROOT_3_OVER_6, ROOT_6_OVER_3], +] +HCP2 = [ + [1, 0, 0], + [-0.5, -ROOT_3_OVER_2, 0], + [-0.5, -ROOT_3_OVER_6, -ROOT_6_OVER_3], + [0, ROOT_3_OVER_3, -ROOT_6_OVER_3], + [0.5, -ROOT_3_OVER_6, -ROOT_6_OVER_3], + [-1, 0, 0], + [-0.5, ROOT_3_OVER_2, 0], + [0.5, ROOT_3_OVER_2, 0], + [0.5, -ROOT_3_OVER_2, 0], + [0.5, -ROOT_3_OVER_6, ROOT_6_OVER_3], + [0, ROOT_3_OVER_3, ROOT_6_OVER_3], + [-0.5, -ROOT_3_OVER_6, ROOT_6_OVER_3], +] + + +def get_edges(vertexpositions, nn_dst, atol=0.1): + # Subtract each point from all the other points + diff = vertexpositions[:, np.newaxis, :] - vertexpositions[np.newaxis, :, :] + + # Compute the norm of the differences along the last axis + distances = np.linalg.norm(diff, axis=-1) + + mask_1nn_2_1nn = np.isclose(distances, nn_dst, atol=atol) + edge_index_0, edge_index_1 = np.where(mask_1nn_2_1nn) + + return edge_index_0, edge_index_1, diff[mask_1nn_2_1nn] + + +def get_matrix(q): + q = np.array(q) + qx = q[0] + qy = q[1] + qz = q[2] + qw = q[3] + matrices = np.empty((3, 3)) + matrices[0, 0] = 1.0 - 2.0 * (qy * qy + qz * qz) + matrices[0, 1] = 2.0 * (qx * qy - qw * qz) + matrices[0, 2] = 2.0 * (qx * qz + qw * qy) + matrices[1, 0] = 2.0 * (qx * qy + qw * qz) + matrices[1, 1] = 1.0 - 2.0 * (qx * qx + qz * qz) + matrices[1, 2] = 2.0 * (qy * qz - qw * qx) + matrices[2, 0] = 2.0 * (qx * qz - qw * qy) + matrices[2, 1] = 2.0 * (qy * qz + qw * qx) + matrices[2, 2] = 1.0 - 2.0 * (qx * qx + qy * qy) + + return matrices + + +class OvitoDataThermal(InMemoryDataset): + def __init__( + self, + root, + dump_file, + nneigh, + frame_number, + rounding_edge_vec_value, + crystal_structure, + rmsd_cutoff=0.1, + transform=None, + pre_transform=None, + pre_filter=None, + one_hot_dim=5, + ): + self.nneigh = nneigh + self.rmsd_cutoff = rmsd_cutoff + self.dump_file = dump_file + self.frame_number = frame_number + self.rounding_edge_vec_value = rounding_edge_vec_value + self.one_hot_dim = one_hot_dim + self.crystal_structure = crystal_structure + + structure_2_PTM = {"fcc": 1, "hcp": 2, "bcc": 3} + self.crystal_structure = crystal_structure + self.StructureType = structure_2_PTM[self.crystal_structure] + + # reference_structure = {"fcc": FCC, "hcp": HCP2} + # self.reference_structure_vecs = reference_structure[self.crystal_structure] + + nn_2_nn_dst_crystals = {"fcc": 1, "hcp": 1} + self.nn_2_nn_dst_crystals = nn_2_nn_dst_crystals[self.crystal_structure] + + remap_functions = {"fcc": self.remap_fcc_nn_vecs, "hcp": self.remap_hcp_nn_vecs} + self.remap_function = remap_functions[self.crystal_structure] + + super().__init__(root, transform, pre_transform, pre_filter) + self.data, self.slices = torch.load(self.processed_paths[0]) + + @property + def raw_file_names(self): + """If this file exists in raw_dir, the download is not triggered. + (The download func. is not implemented here) + """ + return ["not_implemented.pt"] + + @property + def processed_file_names(self): + """If these files are found in raw_dir, processing is skipped""" + return ["data.pt"] + + def download(self): + # Download to `self.raw_dir`. + pass + + def get_per_atom_energies(self, pipeline): + try: + data_relaxed = pipeline.compute(1) + c_peratom = np.array(data_relaxed.particles["c_peratom"]) + return c_peratom + # os.makedirs(self.root + "/per_atom_quantity/", exist_ok=True) + # torch.save(c_peratom, self.root + "/per_atom_quantity/energy.pt") + except: + print("No c_peratom quantity") + + def get_per_atom_displacements(self, pipeline): + try: + pipeline.modifiers.append(CalculateDisplacementsModifier()) + data_relaxed = pipeline.compute(1) + disp = np.array(data_relaxed.particles["Displacement Magnitude"]) + return disp + # os.makedirs(self.root + "/per_atom_quantity/", exist_ok=True) + # torch.save(disp, self.root + "/per_atom_quantity/disp.pt") + except: + print("No relaxed quantity") + + def get_pipeline_and_data(self): + # Overwirtting get pipeline to add selection modifiers to only select perfect structures + pipeline = import_file(self.dump_file) + print(self.dump_file) + + # PTM and selection modifier + + ptm = PolyhedralTemplateMatchingModifier() + ptm.only_selected = False + ptm.rmsd_cutoff = self.rmsd_cutoff + ptm.output_interatomic_distance = True + ptm.output_orientation = True + ptm.output_ordering = True + + pipeline.modifiers.append(ptm) + pipeline.modifiers.append( + ExpressionSelectionModifier( + expression=f"StructureType =={self.StructureType}" + ) + ) + + data = pipeline.compute(int(self.frame_number)) + + return pipeline, data + + @staticmethod + def get_angles(nn_vecs, reference_vecs): + dot_products = np.dot(nn_vecs, reference_vecs.T) + + vectors_magnitudes = np.linalg.norm(nn_vecs, axis=-1, keepdims=True) + fcc_magnitudes = np.linalg.norm(reference_vecs, axis=-1) + + magnitude_products = vectors_magnitudes * fcc_magnitudes + + cosine_thetas = dot_products / magnitude_products + cosine_thetas = np.clip(cosine_thetas, -1.0, 1.0) + + angles = np.arccos(cosine_thetas) + return angles + + def remap_hcp_nn_vecs(self, data, nn_vecs): + orientations = data.particles["Orientation"].array + structure_types = data.particles["Structure Type"].array + + rotation_matrices = np.array([get_matrix(q) for q in orientations]) + nn_vecs = np.einsum("aij,ajk->aik", nn_vecs, rotation_matrices) + + reference_vecs1 = np.array(HCP1) + reference_vecs2 = np.array(HCP2) + + angles_HCP1 = self.get_angles(nn_vecs, reference_vecs1) + angles_HCP2 = self.get_angles(nn_vecs, reference_vecs2) + + best_matches_HCP1 = np.argmin(angles_HCP1, axis=-1) + best_matches_HCP2 = np.argmin(angles_HCP2, axis=-1) + + avg_angle_diff_HCP1 = np.mean(np.min(angles_HCP1, axis=-1), axis=-1) + avg_angle_diff_HCP2 = np.mean(np.min(angles_HCP2, axis=-1), axis=-1) + + mapped_nn_vecs = reference_vecs2[best_matches_HCP2] + + best_matches = best_matches_HCP2 + mask = np.where(avg_angle_diff_HCP1 < avg_angle_diff_HCP2)[0] + + mapped_nn_vecs[mask] = reference_vecs1[best_matches_HCP1][mask] + + best_matches[mask] = best_matches_HCP1[mask] + + # masking out the ones that have more than 1 time a the same vecs + all_edge_different = np.cumsum(np.arange(self.nneigh))[-1] + + selected_particles = np.logical_and( + np.sum(best_matches, axis=-1) == all_edge_different, + np.array(structure_types) == self.StructureType, + ) + + best_match_selected = best_matches[selected_particles] + + assert len(np.unique(np.sum(best_match_selected, axis=-1))) == 1 + return mapped_nn_vecs, np.where(selected_particles == True)[0] + + def remap_fcc_nn_vecs(self, data, nn_vecs): + orientations = data.particles["Orientation"].array + structure_types = data.particles["Structure Type"].array + + rotation_matrices = np.array([get_matrix(q) for q in orientations]) + nn_vecs = np.einsum("aij,ajk->aik", nn_vecs, rotation_matrices) + + reference_vecs = np.array(FCC) + # Computing angle + angles = self.get_angles(nn_vecs, reference_vecs) + best_matches = np.argmin(angles, axis=-1) + + nn_vecs_match = reference_vecs[best_matches] + + # masking out the ones that have more than 1 time a the same vecs + all_edge_different = np.cumsum(np.arange(self.nneigh))[-1] + + selected_particles = np.logical_and( + np.sum(best_matches, axis=-1) == all_edge_different, + np.array(structure_types) == self.StructureType, + ) + + best_match_selected = best_matches[selected_particles] + + assert len(np.unique(np.sum(best_match_selected, axis=-1))) == 1 + + return nn_vecs_match, np.where(selected_particles == True)[0] + + def process(self): + # Read data into huge `Data` list. + + pipeline, data = self.get_pipeline_and_data() + + # get per atom properties + energy = self.get_per_atom_energies(pipeline) + disp = self.get_per_atom_displacements(pipeline) + + # get atom types + atom_types = np.array(data.particles.particle_types) + + natoms = data.particles.count + + # setup onehot encoder of atom types + atom_types_one_hot = torch.eye(self.one_hot_dim, dtype=torch.float64)[ + atom_types - 1 + ] + + # setup NN finder + finder = NearestNeighborFinder(self.nneigh, data) + nn_indexs, nn_vecs = finder.find_all() + + nn_vecs, selected_particles = self.remap_function(data, nn_vecs) + + atom_indices = torch.arange(natoms) + # create neighborhood matrix + neighborhood = torch.zeros((natoms, self.nneigh + 1), dtype=int) + neighborhood[:, 0] = atom_indices + neighborhood[:, 1:] = torch.from_numpy(nn_indexs) + + data_list = [] + + # loop over central atom i + for iatom in tqdm(range(natoms), desc="Dataset atom i..."): + # central atom i at index 0 with its NN + ineighborhood = neighborhood[iatom] + + # get edge index for 1nn-1nn edges, shift them by one since first atom is center + edge_index_0, edge_index_1, edge_vec = get_edges( + nn_vecs[iatom], nn_dst=self.nn_2_nn_dst_crystals, atol=0.1 + ) + edge_index_0 = edge_index_0 + 1 + edge_index_1 = edge_index_1 + 1 + + # Add central atom to 1nn information + + edge_index_0 = np.concatenate( + ( + [0] * self.nneigh, + np.arange(1, self.nneigh + 1), + edge_index_0, + ) + ) + edge_index_1 = np.concatenate( + ( + np.arange(1, self.nneigh + 1), + [0] * self.nneigh, + edge_index_1, + ) + ) + + edge_vec = np.concatenate((nn_vecs[iatom], nn_vecs[iatom], edge_vec)) + + edge_index_0 = torch.from_numpy(edge_index_0) + edge_index_1 = torch.from_numpy(edge_index_1) + edge_vec = torch.from_numpy(edge_vec) + + edge_index = torch.stack((edge_index_0, edge_index_1)) + + edge_vec = torch.tensor(edge_vec, dtype=torch.float64) + + node_attr = atom_types_one_hot[ineighborhood] + + num_edges = edge_index.shape[1] + node_input = torch.ones(self.nneigh + 1, 1, dtype=torch.double) + edge_attr = torch.ones(num_edges, 1, dtype=torch.double) + + g = { + "edge_index": edge_index, + "edge_vec": edge_vec, + "node_attr": node_attr, + "index": iatom, + "edge_attr": edge_attr, + "node_input": node_input, + } + + if disp is not None: + g["disp"] = disp[iatom] + + if energy is not None: + g["energy"] = energy[iatom] + + graph = Data(**g) + data_list.append(graph) + + if selected_particles is not None: + # selected_particles = np.where(np.array(selected_mask) == 1)[0] + + data_list = [data_list[i] for i in selected_particles] + + if self.pre_filter is not None: + data_list = [data for data in data_list if self.pre_filter(data)] + + if self.pre_transform is not None: + data_list = [self.pre_transform(data) for data in data_list] + + data, slices = self.collate(data_list) + torch.save((data, slices), self.processed_paths[0]) diff --git a/chemicalmotifidentifier/experimental/frameworks_base.py b/chemicalmotifidentifier/experimental/frameworks_base.py new file mode 100644 index 0000000..6a10e9b --- /dev/null +++ b/chemicalmotifidentifier/experimental/frameworks_base.py @@ -0,0 +1,48 @@ + +from torch_geometric.transforms import Compose + +from .._src.frameworks_base import BaseChemicalMotifIdentifier +from .._src.transforms import AddEdges, ModuloConcentration, RemoveCentralNode +from .dataset import OvitoDataThermal + + +class BasePTMMotifIdentifier(BaseChemicalMotifIdentifier): + def __init__(self, rmsd_cutoff=0.1, **kwargs): + self.mode = "synthetic" + self.rmsd_cutoff = rmsd_cutoff + super().__init__(**kwargs) + + dataset_config = { + "pre_filter": None, + "one_hot_dim": 5, + "crystal_structure": self.crystal_structure, + "rmsd_cutoff": self.rmsd_cutoff, + } + + dataset_object = OvitoDataThermal + self.set_dataset_modes(dataset_config, dataset_object) + self.import_synthetic() + + self.nelement = 5 + self.transfroms = { + "fcc": Compose( + [ + RemoveCentralNode(), + ModuloConcentration(self.nelement, self.num_nodes), + ] + ), + "bcc": Compose( + [ + AddEdges(8), + RemoveCentralNode(), + ModuloConcentration(self.nelement, self.num_nodes), + ] + ), + "hcp": Compose( + [ + RemoveCentralNode(), + ModuloConcentration(self.nelement, self.num_nodes), + ] + ), + } + \ No newline at end of file diff --git a/examples/doi-10.48550-arXiv.2311.01545/_frameworks.py b/examples/doi-10.48550-arXiv.2311.01545/_frameworks.py index 741051d..e7112f2 100644 --- a/examples/doi-10.48550-arXiv.2311.01545/_frameworks.py +++ b/examples/doi-10.48550-arXiv.2311.01545/_frameworks.py @@ -4,6 +4,7 @@ from chemicalmotifidentifier import ( BaseMonteCarloChemicalMotifIdentifier, + BasePTMMotifIdentifier, BaseSyntheticChemicalMotifIdentifier, ) @@ -111,3 +112,36 @@ def import_model_config(self): "mul": 50, # 50 } self.model_config = model_config + + +class PTMMotifIdentifier(BasePTMMotifIdentifier): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.nelement = 3 + self.dataset_config["one_hot_dim"] = 3 + + def import_synthetic(self): + """Import chemical shell synthetic dataset pandas dataframe""" + self.df_synthetic = pd.read_pickle( + f"data/inputs_doi-10.48550-arXiv.2311.01545/df_{self.crystal_structure}.pkl" + ) + + def import_model_config(self): + lmax, layers, outlength, number = 1, 1, 100, 0 # 2,2,4,0 + + model_config = { + "out_feature_length": outlength, + "max_radius": 2.5, + "min_radius": 0, + "number_of_basis": 10, + "num_nodes": 12, # only used for feqtur enormalization, we don't really care set it to a constant so that we can have only 1 network for all the crystal structure + "num_neighbors": 11, # only used for feqtur enormalization, we don't really care set it to a constant so that we can have only 1 network for all the crystal structure + "layers": layers, + "lmax": lmax, + "net_number": number, + "irreps_node_attr": "3x0e", + # "model_load": f"/home/ksheriff/PAPERS/first_paper/02_1nn_synthetic/data/nets/net_{lmax}-{layers}-{outlength}_{number}.pt", + "model_load": "data/inputs_doi-10.48550-arXiv.2311.01545/net.pt", + "mul": 50, # 50 + } + self.model_config = model_config diff --git a/examples/doi-10.48550-arXiv.2311.01545/fcc_ternary_system.ipynb b/examples/doi-10.48550-arXiv.2311.01545/fcc_ternary_system.ipynb index eb31298..e39ecdc 100644 --- a/examples/doi-10.48550-arXiv.2311.01545/fcc_ternary_system.ipynb +++ b/examples/doi-10.48550-arXiv.2311.01545/fcc_ternary_system.ipynb @@ -9,7 +9,7 @@ "This is a quick tutorial on:\n", "1. Analytically obtaining a pattern inventory for a ternary system (CrCoNi) in the fcc crystal structure. \n", "2. Obtaining the pattern inventory with ML and creating a physically constrained embedding space from which we can compute dissimilarities between motifs. \n", - "3. Computing dissimilarity between motifs in atomistic data." + "3. Chemical motif identification in atomistic data and computing dissimilarity between motifs in atomistic data." ] }, { @@ -505,6 +505,13 @@ "df.to_pickle(\"data/mc/outputs/eca_id/300K/df_microstates.pkl\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, you can use the ``_frameworks.PTMMotifIdentifier'' to only identify motifs in specific crystal structures and for correcting their distortions." + ] + }, { "cell_type": "markdown", "metadata": {},