diff --git a/mlspm/cli.py b/mlspm/cli.py index 5770f66..409287f 100644 --- a/mlspm/cli.py +++ b/mlspm/cli.py @@ -1,4 +1,5 @@ import argparse +from typing import Optional def _bool_type(value): @@ -9,10 +10,13 @@ def _bool_type(value): raise KeyError(f"`{value}` can't be interpreted as a boolean.") -def parse_args() -> dict: +def parse_args(argv: Optional[list[str]] = None) -> dict: """ Parse some useful CLI arguments for use in training scripts. + Arguments: + argv: List of argument values. Defaults to ``sys.argv``. + Returns: A dictionary of the argument values. """ @@ -68,5 +72,5 @@ def parse_args() -> dict: parser.add_argument( "--avg_best_epochs", type=int, default=3, help="Number of epochs to average the best validation loss over. Default = 3." ) - args = parser.parse_args() + args = parser.parse_args(argv) return vars(args) diff --git a/mlspm/data_loading.py b/mlspm/data_loading.py index a5524da..4044e03 100644 --- a/mlspm/data_loading.py +++ b/mlspm/data_loading.py @@ -136,7 +136,7 @@ def decode_xyz(key: str, data: Any) -> Tuple[np.ndarray, np.ndarray] | Tuple[Non sw = get_scan_window_from_comment(comment) xyz = [] while line := data.readline().decode("utf-8"): - e, x, y, z, _ = line.strip().split() + e, x, y, z = line.strip().split()[:4] try: e = int(e) except ValueError: @@ -184,7 +184,7 @@ def get_scan_window_from_comment(comment: str) -> np.ndarray: return sw -def _rotate_and_stack(src: Iterable[dict], reverse: bool = True) -> Generator[dict, None, None]: +def _rotate_and_stack(src: Iterable[dict], reverse: bool = False) -> Generator[dict, None, None]: """ Take a sample in dict format and update it with fields containing an image stack, xyz coordinates and scan window. Rotate the images to be xy-indexing convention and stack them into a single array. @@ -194,7 +194,7 @@ def _rotate_and_stack(src: Iterable[dict], reverse: bool = True) -> Generator[di Arguments: src: Iterable of dicts with the fields: - - ``'{000..0xx}.jpg'`` - :class:`PIL.Image.Image` of one slice of the simulation. + - ``'{000..0xx}.{jpg,png}'`` - :class:`PIL.Image.Image` of one slice of the simulation. - ``'xyz'`` - Tuple(:class:`np.ndarray`, :class:`np.ndarray`) of the xyz data and the scan window. reverse: Whether the order of the image stack is reversed. diff --git a/mlspm/graph/_molecule_graph.py b/mlspm/graph/_molecule_graph.py index 60a90fb..7183843 100755 --- a/mlspm/graph/_molecule_graph.py +++ b/mlspm/graph/_molecule_graph.py @@ -276,8 +276,10 @@ def transform_xy( """ Transform atom positions in the xy plane. + Transformations are perfomed in the order: shift, rotate, flip x, flip y + Arguments: - shift: Shift atom positions in xy plane. Performed before rotation and flip. + shift: Shift atom positions in xy plane. rot_xy: Rotate atoms in xy plane by rot_xy degrees around center point. flip_x: Mirror atom positions in x direction with respect to the center point. flip_y: Mirror atom positions in y direction with respect to the center point. diff --git a/mlspm/graph/_utils.py b/mlspm/graph/_utils.py index e0b8e80..b3f3d10 100644 --- a/mlspm/graph/_utils.py +++ b/mlspm/graph/_utils.py @@ -419,6 +419,7 @@ def save_graphs_to_xyzs( Arguments: molecules: Molecule graphs to save. classes: Chemical elements for atom classification. Either atomic numbers of chemical symbols. + The element for each atom in the graph is the first element in the corresponding class. outfile_format: Formatting string for saved files. Sample index is available in variable ``ind``. start_ind: Index where file numbering starts. verbose: Whether to print output information. diff --git a/mlspm/logging.py b/mlspm/logging.py index 5678c50..92cb14a 100644 --- a/mlspm/logging.py +++ b/mlspm/logging.py @@ -204,10 +204,6 @@ def __init__( self._synced_losses = {"train": SyncedLoss(len(self.loss_labels)), "val": SyncedLoss(len(self.loss_labels))} self._init_log(init_epoch) - def __del__(self): - if self.stream is not sys.stdout: - self.stream.close() - def _init_log(self, init_epoch: Optional[int]): log_exists = os.path.isfile(self.log_path) if self.world_size > 1: diff --git a/papers/ice_structure_discovery/training/fit_posnet.py b/papers/ice_structure_discovery/training/fit_posnet.py index f3bd8db..dbae516 100644 --- a/papers/ice_structure_discovery/training/fit_posnet.py +++ b/papers/ice_structure_discovery/training/fit_posnet.py @@ -240,14 +240,7 @@ def run(cfg): lr_decay.step() # Log losses - try: - loss_logger.add_train_loss(loss) - except ValueError as e: - torch.save(model.module.state_dict(), save_path := os.path.join(cfg['run_dir'], 'debug_model.pth')) - with open('debug_data', 'wb') as f: - pickle.dump((X.cpu().numpy(), ref.cpu().numpy()), f) - print(f'Save debug data on rank {cfg["global_rank"]}') - raise e + loss_logger.add_train_loss(loss) if cfg['timings'] and cfg['global_rank'] == 0: torch.cuda.synchronize() diff --git a/tests/integration_tests/test_train_posnet.py b/tests/integration_tests/test_train_posnet.py new file mode 100644 index 0000000..e0c2f97 --- /dev/null +++ b/tests/integration_tests/test_train_posnet.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python3 + +import os +import random +import shutil +import tarfile +from functools import partial +from pathlib import Path + +import numpy as np +import torch +import webdataset as wds +from torch import nn, optim + +import mlspm.data_loading as dl +import mlspm.preprocessing as pp +from mlspm import graph, utils +from mlspm.cli import parse_args +from mlspm.logging import LossLogPlot +from mlspm.models import PosNet + +from PIL import Image + + +def make_model(device, cfg): + outsize = round((cfg["z_lims"][1] - cfg["z_lims"][0]) / cfg["box_res"][2]) + 1 + model = PosNet( + encode_block_channels=[2, 4, 8, 16], + encode_block_depth=2, + decode_block_channels=[16, 8, 4], + decode_block_depth=1, + decode_block_channels2=[16, 8, 4], + decode_block_depth2=1, + attention_channels=[16, 16, 16], + res_connections=True, + activation="relu", + padding_mode="zeros", + pool_type="avg", + decoder_z_sizes=[5, 10, outsize], + z_outs=[3, 3, 5, 8], + peak_std=cfg["peak_std"], + device=device + ) + criterion = nn.MSELoss(reduction="mean") + optimizer = optim.Adam(model.parameters(), lr=cfg["lr"]) + lr_decay_rate = 1e-5 + lr_decay = optim.lr_scheduler.LambdaLR(optimizer, lambda b: 1.0 / (1.0 + lr_decay_rate * b)) + return model, criterion, optimizer, lr_decay + + +def make_test_data(cfg): + out_dir = Path(cfg["data_dir"]) + out_dir.mkdir(exist_ok=True) + urls = wds.shardlists.expand_urls(cfg["urls_train"]) + i_sample = 0 + for url in urls: + temp_dir = Path(f"temp_{url}") + temp_dir.mkdir(exist_ok=True) + os.chdir(temp_dir) + with tarfile.open(url, "w") as f: + for _ in range(10): + afm = np.random.randint(0, 255, (64, 64, 8), dtype=np.uint8) + for i in range(afm.shape[-1]): + img_path = f"{i_sample}.{i}.png" + Image.fromarray(afm[:, ::-1, i].T).save(img_path) + f.add(img_path) + xyz = np.random.rand(8, 3) + xyz[:, :2] *= 8 + atoms = np.concatenate([xyz, np.random.randint(1, 10, (8, 1))], axis=1) + xyz_path = f"{i_sample}.xyz" + utils.write_to_xyz(atoms, outfile=xyz_path, comment_str="Scan window: [[0.0 0.0 0.0], [8.0 8.0 1.0]]", verbose=0) + f.add(xyz_path) + i_sample += 1 + os.chdir("..") + (temp_dir / url).rename(out_dir / url) + shutil.rmtree(temp_dir) + + +def apply_preprocessing(batch, cfg): + box_res = cfg["box_res"] + z_lims = cfg["z_lims"] + zmin = cfg["zmin"] + peak_std = cfg["peak_std"] + + X, atoms, scan_windows = [batch[k] for k in ["X", "xyz", "sw"]] + + nz_max = X[0].shape[-1] + nz = random.choice(range(1, nz_max + 1)) + z0 = random.choice(range(0, min(5, nz_max + 1 - nz))) + X = [x[:, :, :, -nz:] for x in X] if z0 == 0 else [x[:, :, :, -(nz + z0) : -z0] for x in X] + + atoms = [a[a[:, -1] != 29] for a in atoms] + pp.top_atom_to_zero(atoms) + xyz = atoms.copy() + mols = [graph.MoleculeGraph(a, []) for a in atoms] + mols, sw = graph.shift_mols_window(mols, scan_windows[0]) + + pp.rand_shift_xy_trend(X, max_layer_shift=0.02, max_total_shift=0.04) + box_borders = graph.make_box_borders(X[0].shape[1:3], res=box_res[:2], z_range=z_lims) + X, mols, box_borders = graph.add_rotation_reflection_graph( + X, mols, box_borders, num_rotations=1, reflections=True, crop=(32, 32), per_batch_item=True + ) + pp.add_norm(X) + pp.add_gradient(X, c=0.3) + pp.add_noise(X, c=0.1, randomize_amplitude=True, normal_amplitude=True) + pp.add_cutout(X, n_holes=5) + + mols = graph.threshold_atoms_bonds(mols, zmin) + ref = graph.make_position_distribution(mols, box_borders, box_res=box_res, std=peak_std) + + return X, [ref], xyz, box_borders + + +def make_webDataloader(cfg): + shard_list = dl.ShardList( + cfg[f"urls_train"], + base_path=cfg["data_dir"], + substitute_param=True, + log=Path(cfg["run_dir"]) / "shards.log", + ) + + dataset = wds.WebDataset(shard_list) + dataset.pipeline.pop() + dataset.append(wds.tariterators.tarfile_to_samples()) + dataset.append(wds.split_by_worker) + dataset.append(wds.decode("pill", dl.decode_xyz)) + dataset.append(dl.rotate_and_stack()) + dataset.append(dl.batched(cfg["batch_size"])) + dataset = dataset.map(partial(apply_preprocessing, cfg=cfg)) + + dataloader = wds.WebLoader( + dataset, + num_workers=cfg["num_workers"], + batch_size=None, + pin_memory=True, + collate_fn=dl.default_collate, + persistent_workers=False, + ) + + return dataset, dataloader + + +def batch_to_device(batch, device): + X, ref, *rest = batch + X = X[0].to(device) + ref = ref[0].to(device) + return X, ref, *rest + + +def run(cfg): + device = "cuda" if torch.cuda.is_available() else "cpu" + + # Create run directory + if not os.path.exists(cfg["run_dir"]): + os.makedirs(cfg["run_dir"]) + + # Define model, optimizer, and loss + model, criterion, optimizer, lr_decay = make_model(device, cfg) + + # Setup checkpointing and load a checkpoint if available + checkpointer = utils.Checkpointer( + model, + optimizer, + additional_data={"lr_params": lr_decay}, + checkpoint_dir=os.path.join(cfg["run_dir"], "Checkpoints/"), + keep_last_epoch=True, + ) + init_epoch = checkpointer.epoch + + # Setup logging + log_file = open(os.path.join(cfg["run_dir"], "batches.log"), "a") + loss_logger = LossLogPlot( + log_path=os.path.join(cfg["run_dir"], "loss_log.csv"), + plot_path=os.path.join(cfg["run_dir"], "loss_history.png"), + loss_labels=cfg["loss_labels"], + loss_weights=cfg["loss_weights"], + print_interval=cfg["print_interval"], + init_epoch=init_epoch, + stream=log_file, + ) + + for epoch in range(cfg["epochs"]): + # Create datasets and dataloaders + _, train_loader = make_webDataloader(cfg) + val_loader = train_loader + + print(f"\n === Epoch {epoch}") + + model.train() + for ib, batch in enumerate(train_loader): + # Transfer batch to device + X, ref, _, _ = batch_to_device(batch, device) + + # Forward + pred, _ = model(X) + loss = criterion(pred, ref) + + # Backward + optimizer.zero_grad(set_to_none=True) + loss.backward() + optimizer.step() + lr_decay.step() + + # Log losses + loss_logger.add_train_loss(loss) + + print(f"Train batch {ib}") + + # Validate + + model.eval() + with torch.no_grad(): + for ib, batch in enumerate(val_loader): + # Transfer batch to device + X, ref, _, _ = batch_to_device(batch, device) + + # Forward + pred, _ = model(X) + loss = criterion(pred, ref) + + loss_logger.add_val_loss(loss) + + print(f"Val batch {ib}") + + # Write average losses to log and report to terminal + loss_logger.next_epoch() + + # Save checkpoint + checkpointer.next_epoch(loss_logger.val_losses[-1][0]) + + # Return to best epoch, and save model weights + checkpointer.revert_to_best_epoch() + print(f"Best validation loss on epoch {checkpointer.best_epoch}: {checkpointer.best_loss}") + + log_file.close() + shutil.rmtree(cfg["run_dir"]) + shutil.rmtree(cfg["data_dir"]) + + +def test_train_posnet(): + # fmt:off + cfg = parse_args( + [ + "--run_dir", "test_train", + "--epochs", "2", + "--batch_size", "4", + "--z_lims", "-1.0", "0.5", + "--zmin", "-1.0", + "--data_dir", "./test_data", + "--urls_train", "train-K-{0..1}_{0..1}.tar", + "--box_res", "0.125", "0.125", "0.100", + "--peak_std", "0.20", + "--lr", "1e-4" + ] + ) + # fmt:on + + make_test_data(cfg) + run(cfg) + + +if __name__ == "__main__": + test_train_posnet() diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..389e51b --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,15 @@ + +import pytest + +def test_parse_args(): + from mlspm.cli import parse_args + + args = parse_args(["--train", "false", "--predict", "False", '--test', "true", "--classes", "1,2,3", "4,5,6"]) + + assert args["train"] == False + assert args["predict"] == False + assert args["test"] == True + assert args["classes"] == [[1, 2, 3], [4, 5, 6]] + + with pytest.raises(KeyError): + parse_args(["--train", "fals"]) diff --git a/tests/test_data_loading.py b/tests/test_data_loading.py new file mode 100644 index 0000000..529a917 --- /dev/null +++ b/tests/test_data_loading.py @@ -0,0 +1,166 @@ +import os +from pathlib import Path + +import numpy as np +import pytest +import torch +from PIL import Image + + +def test_shardlist(): + from mlspm.data_loading import ShardList + + urls = "test_K-{0..1}_{0..1}" + base_path = "./base/" + log_path = Path("shards.log") + + # Test with subtitution + shard_list = ShardList(urls=urls, base_path=base_path, world_size=1, rank=0, substitute_param=True, log=log_path) + + assert len(list(shard_list)) == 2 + + # Test without subtitution + shard_list = ShardList(urls=urls, base_path=base_path, world_size=1, rank=0, substitute_param=False, log=log_path) + + shards = list(shard_list) + shards_expected = [ + {"url": "./base/test_K-0_0"}, + {"url": "./base/test_K-0_1"}, + {"url": "./base/test_K-1_0"}, + {"url": "./base/test_K-1_1"}, + ] + assert shards == shards_expected + + # Test splitting over ranks + shard_list_rank0 = ShardList(urls=urls, base_path=base_path, world_size=2, rank=0, substitute_param=False, log=log_path) + shard_list_rank1 = ShardList(urls=urls, base_path=base_path, world_size=2, rank=1, substitute_param=False, log=log_path) + + shards = list(shard_list_rank0) + shards_expected = [ + {"url": "./base/test_K-0_0"}, + {"url": "./base/test_K-1_0"}, + ] + assert shards == shards_expected + + shards = list(shard_list_rank1) + shards_expected = [ + {"url": "./base/test_K-0_1"}, + {"url": "./base/test_K-1_1"}, + ] + assert shards == shards_expected + + # Test url list filling (list size not divisible by world_size) + shard_list_rank0 = ShardList(urls=urls, base_path=base_path, world_size=3, rank=0, substitute_param=False, log=log_path) + shard_list_rank1 = ShardList(urls=urls, base_path=base_path, world_size=3, rank=1, substitute_param=False, log=log_path) + shard_list_rank2 = ShardList(urls=urls, base_path=base_path, world_size=3, rank=2, substitute_param=False, log=log_path) + + assert len(list(shard_list_rank0)) == 2 + assert len(list(shard_list_rank1)) == 2 + assert len(list(shard_list_rank2)) == 2 + + # Test url list filling (world size at least twice as big as the list size) + shard_list = ShardList(urls=urls, base_path=base_path, world_size=9, rank=8, substitute_param=False, log=log_path) + assert len(list(shard_list)) == 1 + + with pytest.raises(ValueError): + # With substitute_param=True, requires "K_{num}" in urls + shard_list = ShardList(urls="test_{0..1}", substitute_param=True, log=log_path) + list(shard_list) + + if log_path.exists(): + os.remove(log_path) + + +def test_decode_xyz(): + from mlspm.data_loading import decode_xyz + from mlspm.utils import read_xyzs, write_to_xyz + + xyz_path = "./test_decode.xyx" + # fmt: off + xyz = np.array([ + [ 0.0, 0.1, 0.2, 1], # (x, y, z, element) + [ 0.0, -0.1, -0.2, 6], + [-0.2, 1.5, -2.5, 14]] + ) + # fmt: on + + write_to_xyz(xyz, xyz_path, comment_str="Scan window: [[-1.0 1.0 .2], [10.0 20 15.0]]`") + with open(xyz_path, "rb") as f: + xyz_bytes = f.read() + + os.remove(xyz_path) + + xyz_decoded, scan_window = decode_xyz(".xyz", data=xyz_bytes) + + assert np.allclose(xyz, xyz_decoded) + assert np.allclose(scan_window, np.array([[-1.0, 1.0, 0.2], [10.0, 20.0, 15.0]])) + + a, b = decode_xyz(".png", data=xyz_bytes) + + assert a is None + assert b is None + + +def test_rotate_and_stack(): + from mlspm.data_loading import _rotate_and_stack + + img0 = np.random.randint(0, 255, (10, 10), dtype=np.uint8) + img1 = np.random.randint(0, 255, (10, 10), dtype=np.uint8) + img0_pil = Image.fromarray(img0) + img1_pil = Image.fromarray(img1) + + # fmt: off + xyz = np.array([ + [ 0.0, 0.1, 0.2, 1], # (x, y, z, element) + [ 0.0, -0.1, -0.2, 6], + [-0.2, 1.5, -2.5, 14]] + ) + # fmt: on + sw = np.array([[-1.0, 1.0, 0.2], [10.0, 20.0, 15.0]]) + + src = [{"000.png": img0_pil, "001.png": img1_pil, "xyz": (xyz, sw)}] + + samples = list(_rotate_and_stack(src, reverse=False)) + + sample = samples[0] + X = sample["X"] + xyz_out = sample["xyz"] + sw_out = sample["sw"] + + assert X.shape == (1, 10, 10, 2) + assert sw_out.shape == (1, 2, 3) + assert xyz_out.shape == (3, 4) + + assert np.allclose(X[0, :, ::-1, 0].T, img0) + assert np.allclose(X[0, :, ::-1, 1].T, img1) + assert np.allclose(xyz, xyz_out) + assert np.allclose(sw, sw_out) + +def test_collate_batch(): + + from mlspm.data_loading import _collate_batch + + batch = [ + { + 'X': np.random.randint(0, 255, (1, 10, 10, 5), dtype=np.uint8), + 'Y': np.random.randint(0, 255, (1, 10, 10), dtype=np.uint8), + 'xyz': np.random.rand(3, 4), + 'sw': np.random.rand(1, 2, 3) + } + for _ in range(3) + ] + + batch = _collate_batch(batch) + + X = batch['X'] + Y = batch['Y'] + xyz = batch['xyz'] + sw = batch['sw'] + + assert len(X) == len(Y) == len(sw) == 1 + assert X[0].shape == (3, 10, 10, 5) + assert Y[0].shape == (3, 10, 10) + assert sw[0].shape == (3, 2, 3) + assert len(xyz) == 3 + for a in xyz: + assert a.shape == (3, 4) diff --git a/tests/test_graph.py b/tests/test_graph.py index 14fe15b..bd580a0 100644 --- a/tests/test_graph.py +++ b/tests/test_graph.py @@ -1,11 +1,16 @@ -import torch -import pytest +import os +import shutil +from pathlib import Path + import numpy as np +import pytest +import torch +from scipy.spatial.distance import cdist def test_collate_graph(): - from mlspm.graph import MoleculeGraph from mlspm.data_loading import collate_graph + from mlspm.graph import MoleculeGraph # fmt: off @@ -215,7 +220,7 @@ def test_molecule_graph_array(): def test_molecule_graph_remove_atoms(): - from mlspm.graph import MoleculeGraph, Atom + from mlspm.graph import Atom, MoleculeGraph # fmt: off @@ -243,7 +248,7 @@ def test_molecule_graph_remove_atoms(): assert removed == [] new_molecule, removed = molecule.remove_atoms([1]) - removed_expected = [(Atom(np.array([1.0, 1.0, 0.0]), 2), [0, 1, 0])] + removed_expected = [(Atom(np.array([1.0, 1.0, 0.0]), 'H'), [0, 1, 0])] atoms_expected = np.array([ [0.0, 0.0, 0.0, 1], [1.0, 0.0, 0.0, 3], @@ -331,10 +336,80 @@ def test_molecule_graph_add_atom(): assert a == b -def test_GraphSeqStats(): - from mlspm.graph import GraphStats +def test_molecule_graph_transform_xy(): + from mlspm.graph import MoleculeGraph + + # fmt:off + atoms = np.array([ + [0.0, 0.0, 0.0, 1], + [1.0, 1.0, 1.0, 2], + [1.0, 0.0, 0.0, 3], + [2.0, 0.0, -1.0, 4] + ]) + # fmt:on + bonds = [(0, 2), (1, 2), (2, 3)] + molecule = MoleculeGraph(atoms, bonds) + + molecule_transformed = molecule.transform_xy(shift=(1, 1), rot_xy=90, flip_x=True, flip_y=True, center=(1, 1)) + + xyz_transformed = molecule_transformed.array(xyz=True) + # fmt:off + xyz_expected = np.array( + [ + [1.0, 1.0, 0.0], + [2.0, 0.0, 1.0], + [1.0, 0.0, 0.0], + [1.0, -1.0, -1.0] + ] + ) + # fmt:on + + assert np.allclose(xyz_transformed, xyz_expected) + + +def test_molecule_graph_crop_atoms(): + from mlspm.graph import MoleculeGraph + + # fmt:off + atoms = np.array([ + [0.0, 0.0, 0.0, 1], + [1.0, 1.0, 1.0, 2], + [1.0, 0.0, 0.0, 3], + [2.0, 0.0, -1.0, 4] + ]) + # fmt:on + bonds = [(0, 2), (1, 2), (2, 3)] + molecule = MoleculeGraph(atoms, bonds) + box_borders = np.array([[-0.5, -0.5, -0.5], [1.5, 1.5, 0.5]]) + + molecule_cropped = molecule.crop_atoms(box_borders) + + xyz_cropped = molecule_cropped.array(xyz=True) + # fmt:off + xyz_expected = np.array( + [ + [0.0, 0.0, 0.0], + [1.0, 0.0, 0.0], + ] + ) + # fmt:on + + assert np.allclose(xyz_cropped, xyz_expected) + assert molecule_cropped.bonds == [(0, 1)] + + +def test_molecule_graph_randomize_positions(): from mlspm.graph import MoleculeGraph + molecule = MoleculeGraph(np.zeros((3, 4)), []) + molecule_randomized = molecule.randomize_positions() + + assert not np.allclose(molecule_randomized.array(xyz=True), 0.0) + + +def test_GraphSeqStats(): + from mlspm.graph import GraphStats, MoleculeGraph + classes = [[0], [1], [2]] # fmt:off @@ -434,7 +509,143 @@ def test_GraphSeqStats(): seq_stats = GraphStats(classes=classes, bin_size=1) seq_stats.add_batch(pred, ref) - # seq_stats.plot('./seq_stats') - # seq_stats.report('./seq_stats') + outdir = Path('./test_stats') + seq_stats.plot(outdir) + seq_stats.report(outdir) + shutil.rmtree(outdir) # fmt:on + + +def test_peaks(): + from mlspm.graph import MoleculeGraph, find_gaussian_peaks, make_position_distribution + + # fmt:off + atoms = np.array([ + [2.0, 2.0, 1.0, 0], + [2.0, 6.0, 2.0, 0], + [6.0, 4.0, 2.0, 0], + [4.0, 2.0, 2.0, 0] + ]) + # fmt:on + box_borders = np.array([[0.0, 0.0, 0.0], [8.0, 8.0, 3.0]]) + mols = [MoleculeGraph(atoms, bonds=[])] + std = 0.2 + devices = ["cpu"] + if torch.cuda.is_available(): + devices.append("cuda") + + dist = make_position_distribution(mols, box_borders=box_borders, std=std) + + for device in devices: + dist_d = torch.from_numpy(dist).to(device) + + for method in ["zncc", "mad", "msd", "mad_norm", "msd_norm"]: + if method == "zncc" and device == "cuda": + with pytest.raises(NotImplementedError): + find_gaussian_peaks(dist_d, box_borders, method=method) + continue + + peaks, _, _ = find_gaussian_peaks(dist_d, match_threshold=0.5, box_borders=box_borders, std=std, method=method) + peaks = peaks[0].cpu().numpy() + + # The matched position could be in different order than the original (and undeterministic order on cuda), + # so we test that the minimum matching distance is close to zero. + d = cdist(peaks, atoms[:, :3]).min(axis=1) + assert np.allclose(d, 0.0) + + +def test_shift_mols_window(): + from mlspm.graph import shift_mols_window, MoleculeGraph + + atoms = [np.array([[1.0, 1.0, 0.0, 0], [3.0, 3.0, 1.0, 0]]), np.array([[2.0, 4.0, 1.0, 0], [1.0, 5.0, 1.0, 0]])] + + scan_windows = np.array( + [ + [[0.0, 0.0, 0.0], [4.0, 4.0, 1.0]], + [[1.0, 3.0, 0.0], [5.0, 7.0, 1.0]], + ] + ) + molecules = [MoleculeGraph(atoms[0], []), MoleculeGraph(atoms[1], [])] + + new_molecules, new_scan_window = shift_mols_window(molecules, scan_windows, start=(2, 2)) + + assert np.allclose(new_molecules[0].array(xyz=True), np.array([[3.0, 3.0, 0.0], [5.0, 5.0, 1.0]])) + assert np.allclose(new_molecules[1].array(xyz=True), np.array([[3.0, 3.0, 1.0], [2.0, 4.0, 1.0]])) + assert np.allclose(new_scan_window, np.array([[2.0, 2.0], [6.0, 6.0]])) + + +def test_crop_graph(): + from mlspm.graph import crop_graph, MoleculeGraph + + afm = [np.arange(32).reshape(1, 4, 4, 2)] + print(afm[0][0, :, :, 0]) + print(afm[0][0, :, :, 1]) + mols = [ + MoleculeGraph( + np.array( + [ + [0.0, 0.0, 0.0, 0], + [0.5, 0.5, 0.5, 0], + [1.1, 1.1, 1.0, 0], + ] + ), + [], + ) + ] + start = (1, 1) + size = (2, 2) + box_borders = [[0.0, 0.0, 0.0], [1.5, 1.5, 1.0]] + new_start = (1.0, 1.0) + + afm_cropped, mols_cropped, box_borders_cropped = crop_graph( + afm, mols, start=start, size=size, box_borders=box_borders, new_start=new_start + ) + + assert np.allclose(afm_cropped[0][0, :, :, 0], np.array([[10, 12], [18, 20]])) + assert np.allclose(afm_cropped[0][0, :, :, 1], np.array([[11, 13], [19, 21]])) + assert np.allclose(mols_cropped[0].array(xyz=True), np.array([[1.0, 1.0, 0.5]])) + assert np.allclose(box_borders_cropped, np.array([[1.0, 1.0, 0.0], [1.5, 1.5, 1.0]])) + + +def test_save_graphs_to_xyzs(): + from mlspm.graph import save_graphs_to_xyzs, MoleculeGraph + from mlspm.utils import read_xyzs + + classes = [[1], [6, 8]] + mols = [ + MoleculeGraph( + np.array( + [ + [0.0, 0.0, 0.0, 1], + [0.5, 0.5, 0.5, 6], + [1.0, 1.0, 1.0, 8], + ] + ), + [], + classes=classes, + ) + ] + + save_graphs_to_xyzs(mols, classes=classes) + xyz = read_xyzs(["0_graph.xyz"])[0] + + os.remove("0_graph.xyz") + + assert np.allclose( + xyz, + np.array( + [ + [0.0, 0.0, 0.0, 1], + [0.5, 0.5, 0.5, 6], + [1.0, 1.0, 1.0, 6], + ] + ), + ) + +def test_make_box_borders(): + + from mlspm.graph import make_box_borders + + box_borders = make_box_borders(shape=(100, 100), res=(0.1, 0.2), z_range=(0.0, 1.0)) + assert np.allclose(box_borders, np.array([[0.0, 0.0, 0.0], [9.9, 19.8, 1.0]])) diff --git a/tests/test_logging.py b/tests/test_logging.py new file mode 100644 index 0000000..74ad942 --- /dev/null +++ b/tests/test_logging.py @@ -0,0 +1,34 @@ + +import os +import numpy as np + +def test_loss_log_plot(): + from mlspm.logging import LossLogPlot + + loss_log_path = "loss_log.csv" + plot_path = "test_plot.png" + log_path = "test_log.txt" + + info_log = open(log_path, 'w') + + if os.path.exists(loss_log_path): + os.remove(loss_log_path) + loss_log = LossLogPlot(loss_log_path, plot_path, loss_labels=["1", "2"], loss_weights=["", ""], stream=info_log) + + losses = [[[0, 1, 2], [1, 2, 3]], [[1.5, 2.0, 3.0], [0.1, 0.4, 0.7]]] + for loss in losses: + loss_log.add_train_loss(loss[0]) + loss_log.add_val_loss(loss[1]) + loss_log.next_epoch() + + new_log = LossLogPlot(loss_log_path, "plot.png", loss_labels=["1", "2"], loss_weights=["", ""], stream=info_log) + + info_log.close() + + os.remove(loss_log_path) + os.remove(plot_path) + os.remove(log_path) + + assert new_log.epoch == 2 + assert np.allclose(new_log.train_losses, np.array([[0.75, 1.5, 2.5]])), new_log.train_losses + assert np.allclose(new_log.val_losses, np.array([[0.55, 1.2, 1.85]])), new_log.val_losses diff --git a/tests/test_losses.py b/tests/test_losses.py index 757042d..9fd19dd 100644 --- a/tests/test_losses.py +++ b/tests/test_losses.py @@ -1,5 +1,4 @@ import torch -import numpy as np def test_EqGraphLoss(): diff --git a/tests/test_models.py b/tests/test_models.py index 03e785f..916c888 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -165,13 +165,13 @@ def test_GraphImgNet(): # Test translational invariance of the MPNN pos += torch.Tensor([1.0, 1.0, 1.0]).to(device) node_features_shifted, edge_features_shifted = model.mpnn(pos, x_afm, edges_combined) - assert torch.allclose(node_features, node_features_shifted) - assert torch.allclose(edge_features, edge_features_shifted) + assert torch.allclose(node_features, node_features_shifted, rtol=1e-4, atol=1e-6) + assert torch.allclose(edge_features, edge_features_shifted, rtol=1e-4, atol=1e-6) # Test that the edges are not directional node_features_reverse, edge_features_reverse = model.mpnn(pos, x_afm, edges_combined[[1, 0]]) - assert torch.allclose(node_features, node_features_reverse) - assert torch.allclose(edge_features, edge_features_reverse) + assert torch.allclose(node_features, node_features_reverse, rtol=1e-4, atol=1e-6) + assert torch.allclose(edge_features, edge_features_reverse, rtol=1e-4, atol=1e-6) # Test whole model model.afm_cutoff = 0.8 diff --git a/tests/test_utils.py b/tests/test_utils.py index 03fdd4a..18b14d1 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,5 +1,10 @@ import os +from pathlib import Path +import shutil + import numpy as np +from torch import nn, optim +import torch def test_xyz_read_write(): @@ -22,27 +27,38 @@ def test_xyz_read_write(): assert np.allclose(xyz, xyz_read) -def test_loss_log_plot(): - from mlspm.logging import LossLogPlot +def test_checkpoints(): + from mlspm.utils import load_checkpoint, save_checkpoint + + save_dir = Path("test_checkpoints") + + model = nn.Linear(10, 10) + optimizer = optim.Adam(model.parameters()) + lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lambda b: 1.0 / (1.0 + 1e-5 * b)) + additional_data = {"test_data": 3, "lr_scheduler": lr_scheduler} - log_path = "loss_log.csv" - plot_path = "test_plot.png" + x, y = np.random.rand(2, 1, 10, 10) + x = torch.from_numpy(x).float() + y = torch.from_numpy(y).float() + pred = model(x) + loss = ((y - pred) ** 2).mean() + loss.backward() + optimizer.step() + lr_scheduler.step() + print(loss) - if os.path.exists(log_path): - os.remove(log_path) - loss_log = LossLogPlot(log_path, plot_path, loss_labels=["1", "2"], loss_weights=["", ""]) + save_checkpoint(model, optimizer, epoch=1, save_dir=save_dir, additional_data=additional_data) - losses = [[[0, 1, 2], [1, 2, 3]], [[1.5, 2.0, 3.0], [0.1, 0.4, 0.7]]] - for loss in losses: - loss_log.add_train_loss(loss[0]) - loss_log.add_val_loss(loss[1]) - loss_log.next_epoch() + model_new = nn.Linear(10, 10) + optimizer_new = optim.Adam(model.parameters()) + lr_scheduler_new = optim.lr_scheduler.LambdaLR(optimizer, lambda b: 1.0 / (1.0 + 1e-5 * b)) + additional_data = {"test_data": 0, "lr_scheduler": lr_scheduler_new} - new_log = LossLogPlot(log_path, "plot.png", loss_labels=["1", "2"], loss_weights=["", ""]) + load_checkpoint(model_new, optimizer_new, file_name=save_dir / "model_1.pth", additional_data=additional_data) - os.remove(log_path) - os.remove(plot_path) + assert np.allclose(model.state_dict()["weight"], model_new.state_dict()["weight"]) + assert np.allclose(optimizer.state_dict()["state"][0]["exp_avg"], optimizer_new.state_dict()["state"][0]["exp_avg"]) + assert np.allclose(lr_scheduler.state_dict()["_last_lr"], lr_scheduler_new.state_dict()["_last_lr"]) + assert np.allclose(additional_data["test_data"], 3) - assert new_log.epoch == 2 - assert np.allclose(new_log.train_losses, np.array([[0.75, 1.5, 2.5]])), new_log.train_losses - assert np.allclose(new_log.val_losses, np.array([[0.55, 1.2, 1.85]])), new_log.val_losses + shutil.rmtree(save_dir) diff --git a/tests/test_visualization.py b/tests/test_visualization.py new file mode 100755 index 0000000..3a60189 --- /dev/null +++ b/tests/test_visualization.py @@ -0,0 +1,88 @@ +import shutil +from pathlib import Path + +import numpy as np + + +def test_make_input_plots(): + + from mlspm.visualization import make_input_plots + + save_dir = Path('test_input_plots') + + X = [np.random.rand(2, 20, 20, 2), np.random.rand(2, 20, 20, 2)] + make_input_plots(X, outdir=save_dir) + + assert len(list(save_dir.glob('*.png'))) == 4 + + shutil.rmtree(save_dir) + +def test_plot_graphs(): + + from mlspm.graph import MoleculeGraph + from mlspm.visualization import plot_graphs + + save_dir = Path('test_graph_plots') + + # fmt:off + atoms = [ + np.array([ + [0.1, 0.2, 0.3, 1], + [0.5, 0.6, 0.7, 1], + [0.9, 1.0, 1.1, 6], + [1.3, 1.4, 1.5, 6] + ]), + np.array([ + [1.7, 1.8, 1.9, 1], + [2.1, 2.2, 2.3, 6] + ]), + np.empty((0, 4)), + np.array([ + [2.5, 2.6, 2.7, 1], + [2.9, 3.0, 3.1, 6], + [3.3, 3.4, 3.5, 6] + ]), + ] + # fmt:on + bonds = [ + [(0,1), (0,2), (1,3), (2,3)], + [(0,1)], + [], + [(0,1), (1,2)] + ] + classes = [[1], [6]] + mols = [MoleculeGraph(a, b, classes) for a, b in zip(atoms, bonds)] + box_borders = np.array([[0.0, 0.0, 0.0], [4.0, 4.0, 4.0]]) + + plot_graphs(mols, mols, box_borders=box_borders, classes=classes, outdir=save_dir) + + assert len(list(save_dir.glob('*.png'))) == 4 + + shutil.rmtree(save_dir) + +def test_plot_distribution_grid(): + + from mlspm.graph import make_position_distribution, MoleculeGraph + from mlspm.visualization import plot_distribution_grid + + save_dir = Path('test_distribution_plots') + + # fmt:off + atoms = np.array([ + [2.0, 2.0, 1.0, 0], + [2.0, 6.0, 2.0, 0], + [6.0, 4.0, 2.0, 0], + [4.0, 2.0, 2.0, 0] + ]) + # fmt:on + box_borders = np.array([[0.0, 0.0, 0.0], [8.0, 8.0, 3.0]]) + mols = [MoleculeGraph(atoms, bonds=[])] + std = 0.2 + + dist = make_position_distribution(mols, box_borders=box_borders, std=std) + + plot_distribution_grid(dist, dist, box_borders=box_borders, outdir=save_dir) + + assert len(list(save_dir.glob('*.png'))) == 2 + + shutil.rmtree(save_dir)