diff --git a/experiments/amplitudes/plots.py b/experiments/amplitudes/plots.py index 8a82c403..c30a3361 100644 --- a/experiments/amplitudes/plots.py +++ b/experiments/amplitudes/plots.py @@ -37,7 +37,7 @@ def plot_mixer(cfg, plot_path, title, plot_dict): for idataset, dataset in enumerate(cfg.data.dataset): data = [ np.log(plot_dict["results_test"][dataset]["raw"]["truth"]), - np.log(plot_dict["results_test"][dataset]["raw"]["truth"]), + np.log(plot_dict["results_train"][dataset]["raw"]["truth"]), np.log(plot_dict["results_test"][dataset]["raw"]["prediction"]), ] plot_histograms( diff --git a/experiments/base_experiment.py b/experiments/base_experiment.py index 41a735fd..10916515 100644 --- a/experiments/base_experiment.py +++ b/experiments/base_experiment.py @@ -10,6 +10,7 @@ from hydra.utils import instantiate import mlflow from torch_ema import ExponentialMovingAverage +import pytorch_optimizer import gatr.primitives.attention import gatr.layers.linear @@ -23,9 +24,6 @@ from gatr.layers import MLPConfig, SelfAttentionConfig -from lion_pytorch import Lion -import schedulefree - cs = ConfigStore.instance() cs.store(name="base_attention", node=SelfAttentionConfig) cs.store(name="base_mlp", node=MLPConfig) @@ -61,7 +59,7 @@ def run_mlflow(self): experiment_id, run_name = self._init() git_hash = os.popen("git rev-parse HEAD").read().strip() LOGGER.info( - f"### Starting experiment {self.cfg.exp_name}/{run_name} (mlflowid={experiment_id}) (jobid={self.cfg.jobid}) (git_hash={git_hash} ###" + f"### Starting experiment {self.cfg.exp_name}/{run_name} (mlflowid={experiment_id}) (jobid={self.cfg.jobid}) (git_hash={git_hash}) ###" ) if self.cfg.use_mlflow: with mlflow.start_run(experiment_id=experiment_id, run_name=run_name): @@ -388,13 +386,13 @@ def _init_optimizer(self, param_groups=None): weight_decay=self.cfg.training.weight_decay, ) elif self.cfg.training.optimizer == "Lion": - self.optimizer = Lion( + self.optimizer = pytorch_optimizer.Lion( param_groups, betas=self.cfg.training.betas, weight_decay=self.cfg.training.weight_decay, ) - elif self.cfg.training.optimizer == "ScheduleFree": - self.optimizer = schedulefree.AdamWScheduleFree( + elif self.cfg.training.optimizer == "ADOPT": + self.optimizer = pytorch_optimizer.ADOPT( param_groups, betas=self.cfg.training.betas, weight_decay=self.cfg.training.weight_decay, @@ -501,8 +499,6 @@ def cycle(iterable): for step in range(self.cfg.training.iterations): # training self.model.train() - if self.cfg.training.optimizer == "ScheduleFree": - self.optimizer.train() data = next(iterator) self._step(data, step) @@ -581,7 +577,7 @@ def _step(self, data, step): grad_norm = ( torch.nn.utils.clip_grad_norm_( self.model.parameters(), - self.cfg.training.clip_grad_norm, + self.cfg.training.clip_grad_norm if self.cfg.training.clip_grad_norm is not None else float('inf'), error_if_nonfinite=False, ) .cpu() @@ -631,8 +627,6 @@ def _validate(self, step): metrics = self._init_metrics() self.model.eval() - if self.cfg.training.optimizer == "ScheduleFree": - self.optimizer.eval() with torch.no_grad(): for data in self.val_loader: # use EMA for validation if available diff --git a/experiments/baselines/dsi.py b/experiments/baselines/dsi.py index 5dcd1789..48732cc8 100644 --- a/experiments/baselines/dsi.py +++ b/experiments/baselines/dsi.py @@ -115,16 +115,18 @@ def __init__( # (could evaluate them pre training, # but we have large batchsizes # so no big difference expected) - self.inv_mean, self.inv_std = None, None + n_invariants = n * (n + 1) // 2 if 
self.use_invariants else 0 + self.register_buffer("inv_inited", torch.tensor(False, dtype=torch.bool)) + self.register_buffer("inv_mean", torch.zeros(1, 1, n_invariants)) + self.register_buffer("inv_std", torch.ones(1, 1, n_invariants)) def _compute_invariants(self, particles): invariants = compute_invariants(particles) # standardize - if self.inv_mean is None or self.inv_std is None: + if not self.inv_inited: self.inv_mean = invariants.mean(dim=-2, keepdim=True) - self.inv_std = invariants.std(dim=-2, keepdim=True) - self.inv_std = self.inv_std.clamp(min=1e-5) + self.inv_std = invariants.std(dim=-2, keepdim=True).clamp(min=1e-5) invariants = (invariants - self.inv_mean) / self.inv_std return invariants diff --git a/experiments/misc.py b/experiments/misc.py index 9295f1ff..2e7b76cf 100644 --- a/experiments/misc.py +++ b/experiments/misc.py @@ -2,10 +2,6 @@ import torch -class NaNError(BaseException): - """Exception to be raise when the training encounters a NaN in loss or model weights.""" - - def get_device() -> torch.device: """Gets CUDA if available, CPU else.""" return torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") diff --git a/experiments/tagging/embedding.py b/experiments/tagging/embedding.py index 20a80c15..da2e54db 100644 --- a/experiments/tagging/embedding.py +++ b/experiments/tagging/embedding.py @@ -90,17 +90,12 @@ def embed_tagging_data_into_ga(fourmomenta, scalars, ptr, cfg_data): fourmomenta.dtype, ) n_spurions = spurions.shape[0] - if cfg_data.beam_token: + if cfg_data.beam_token and n_spurions > 0: # prepend spurions to the token list (within each block) - spurion_idxs = torch.cat( - [ - torch.arange( - ptr_start + i * n_spurions, - ptr_start + (i + 1) * n_spurions, - ) - for i, ptr_start in enumerate(ptr[:-1]) - ] - ) + spurion_idxs = torch.stack( + [ptr[:-1] + i for i in range(n_spurions)], dim=0 + ) + n_spurions * torch.arange(batchsize, device=ptr.device) + spurion_idxs = spurion_idxs.permute(1, 0).flatten() insert_spurion = torch.zeros( multivectors.shape[0] + n_spurions * batchsize, dtype=torch.bool, @@ -131,18 +126,13 @@ def embed_tagging_data_into_ga(fourmomenta, scalars, ptr, cfg_data): multivectors = torch.cat((multivectors, spurions), dim=-2) # global tokens - if cfg_data.include_global_token: + num_global_tokens = cfg_data.num_global_tokens + if cfg_data.include_global_token and num_global_tokens > 0: # prepend global tokens to the token list - num_global_tokens = cfg_data.num_global_tokens - global_idxs = torch.cat( - [ - torch.arange( - ptr_start + i * num_global_tokens, - ptr_start + (i + 1) * num_global_tokens, - ) - for i, ptr_start in enumerate(ptr[:-1]) - ] - ) + global_idxs = torch.stack( + [ptr[:-1] + i for i in range(num_global_tokens)], dim=0 + ) + num_global_tokens * torch.arange(batchsize, device=ptr.device) + global_idxs = global_idxs.permute(1, 0).flatten() is_global = torch.zeros( multivectors.shape[0] + batchsize * num_global_tokens, dtype=torch.bool, diff --git a/gatr/layers/mlp/geometric_bilinears.py b/gatr/layers/mlp/geometric_bilinears.py index aebeeffb..061f53f2 100644 --- a/gatr/layers/mlp/geometric_bilinears.py +++ b/gatr/layers/mlp/geometric_bilinears.py @@ -18,7 +18,7 @@ class GeometricBilinear(nn.Module): """Geometric bilinear layer. Pin-equivariant map between multivector tensors that constructs new geometric features via - geometric products and the equivariant join (based on a reference vector). + geometric products. 
Parameters ---------- diff --git a/gatr/layers/mlp/mlp.py b/gatr/layers/mlp/mlp.py index 7cf7be1e..fbfed722 100644 --- a/gatr/layers/mlp/mlp.py +++ b/gatr/layers/mlp/mlp.py @@ -18,7 +18,7 @@ class GeoMLP(nn.Module): """Geometric MLP. This is a core component of GATr's transformer blocks. It is similar to a regular MLP, except - that it uses geometric bilinears (GP and equivariant join) in place of the first linear layer. + that it uses geometric bilinears (the geometric product) in place of the first linear layer. Assumes input has shape `(..., channels[0], 16)`, output has shape `(..., channels[-1], 16)`, will create hidden layers with shape `(..., channel, 16)` for each additional entry in diff --git a/requirements.txt b/requirements.txt index f1f6a380..d62e9aaf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,21 +4,22 @@ black==22.3.0 ## Standard science stack h5py +hdf5plugin matplotlib -numpy<1.25 +numpy>=1.25.0,<=1.26.4 # numpy>=1.25.0 is needed for pytorch-optimizer, numpy<=1.26.4 is needed for numba scipy ## Standard utils tqdm +wget ## Deep learning einops -git+https://github.com/dgasmith/opt_einsum.git@1a984b7b75f3e532e7129f6aa13f7ddc3da66e10 # Un-released latest master (pinned for repro); the latest released version 3.3.0 has bugs in shape-based contraction path computations (used for einsum) +opt_einsum torch==2.2.0 torch_geometric==2.4.0 torch-ema -lion-pytorch==0.1.4 -schedulefree==1.2.5 +git+https://github.com/kozistr/pytorch_optimizer # most recent pytorch_optimizer contains ADOPT; replace this with simply 'pytorch_optimizer>=3.2.0' after the next release ## Experiment management hydra-core @@ -28,7 +29,7 @@ optuna ## Project-specific: general clifford==1.4.0 numba==0.58.1 # otherwise has conflicts -xformers==0.0.24 +xformers==0.0.24 # there exists known compatibility issues with xformers on Mac systems torchdiffeq # for CFM sampling #weaver packages diff --git a/tests/gatr/primitives/test_nonlinearities.py b/tests/gatr/primitives/test_nonlinearities.py index bb1e61cb..2b41bf66 100644 --- a/tests/gatr/primitives/test_nonlinearities.py +++ b/tests/gatr/primitives/test_nonlinearities.py @@ -4,13 +4,11 @@ import torch from gatr.primitives import gated_relu, gated_sigmoid -from gatr.primitives.nonlinearities import gated_gelu, gated_gelu_divide +from gatr.primitives.nonlinearities import gated_gelu from tests.helpers import BATCH_DIMS, TOLERANCES, check_pin_equivariance -@pytest.mark.parametrize( - "fn", [gated_relu, gated_gelu, gated_sigmoid, gated_gelu_divide] -) +@pytest.mark.parametrize("fn", [gated_relu, gated_gelu, gated_sigmoid]) @pytest.mark.parametrize("batch_dims", BATCH_DIMS) def test_gated_nonlin_equivariance(fn, batch_dims): """Tests an identity map for equivariance (testing the test).""" @@ -18,12 +16,3 @@ def test_gated_nonlin_equivariance(fn, batch_dims): check_pin_equivariance( fn, 1, fn_kwargs=dict(gates=gates), batch_dims=batch_dims, **TOLERANCES ) - - -def test_gelu_division(): - """Test that gated_gelu_divide() is equal to gelu(x)/x.""" - values = torch.randn(2, 3, 4) - gates = torch.randn(2, 3, 4) - y1 = gated_gelu(values, gates) / gates - y2 = gated_gelu_divide(values, gates) - torch.testing.assert_close(y1, y2) diff --git a/tests_regression/regression_datasets/particle_mass.py b/tests_regression/regression_datasets/particle_mass.py index 4f5b9db9..082b37f8 100644 --- a/tests_regression/regression_datasets/particle_mass.py +++ b/tests_regression/regression_datasets/particle_mass.py @@ -1,6 +1,5 @@ import torch -from experiments.base_wrapper 
import BaseWrapper from gatr.interface import embed_vector, extract_scalar from tests_regression.regression_datasets.constants import DATASET_SIZE, DEVICE @@ -31,7 +30,7 @@ def __getitem__(self, idx): return self.particle[idx], self.mass[idx] -class ParticleMassWrapper(BaseWrapper): +class ParticleMassWrapper(torch.nn.Module): """Wrapper around GATr networks for PointsDistanceDataset.""" mv_in_channels = 1 @@ -42,7 +41,34 @@ class ParticleMassWrapper(BaseWrapper): raw_out_channels = 1 def __init__(self, net): - super().__init__(net, scalars=True, return_other=False) + super().__init__() + self.net = net + + def forward(self, inputs: torch.Tensor): + """Wrapped forward pass pass. + + Parses inputs into GA + scalar representation, calls the forward pass of the wrapped net, + and extracts the outputs from the GA + scalar representation again. + + Parameters + ---------- + inputs : torch.Tensor + Raw inputs, as given by dataset. + + Returns + ------- + outputs : torch.Tensor + Raw outputs, as expected in dataset. + """ + + multivector, scalars = self.embed_into_ga(inputs) + multivector_outputs, scalar_outputs = self.net( + multivector, + scalars=scalars, + ) + outputs = self.extract_from_ga(multivector_outputs, scalar_outputs) + + return outputs def embed_into_ga(self, inputs): """Embeds raw inputs into the geometric algebra (+ scalar) representation. @@ -89,8 +115,6 @@ def extract_from_ga(self, multivector, scalars): ------- outputs : torch.Tensor Raw outputs, as expected in dataset. - other : torch.Tensor - Additional output data, e.g. required for regularization. """ _, num_objects, num_channels, num_ga_components = multivector.shape @@ -99,4 +123,4 @@ def extract_from_ga(self, multivector, scalars): assert num_ga_components == 16 norm = extract_scalar(multivector[:, :, 0, :]) # (batchsize, 1, 1) - return norm, None + return norm diff --git a/tests_regression/regression_datasets/top_reconstruction.py b/tests_regression/regression_datasets/top_reconstruction.py index 25f70724..4c88cb96 100644 --- a/tests_regression/regression_datasets/top_reconstruction.py +++ b/tests_regression/regression_datasets/top_reconstruction.py @@ -1,11 +1,8 @@ import torch -from experiments.base_wrapper import BaseWrapper from gatr.interface import embed_vector, extract_vector from tests_regression.regression_datasets.constants import DATASET_SIZE, DEVICE -import matplotlib.pyplot as plt # testing - class TopReconstructionDataset(torch.utils.data.Dataset): """Toy dataset for reconstruction the mass of a top quark and W boson from @@ -169,18 +166,43 @@ def __getitem__(self, idx): return self.event[idx], self.reco[idx] -class TopReconstructionWrapper(BaseWrapper): +class TopReconstructionWrapper(torch.nn.Module): """Wrapper around GATr networks for TopReconstructionDataset.""" mv_in_channels = 1 mv_out_channels = 2 s_in_channels = 1 s_out_channels = 1 - # raw_in_channels = 4 - # raw_out_channels = 1 def __init__(self, net): - super().__init__(net, scalars=True, return_other=False) + super().__init__() + self.net = net + + def forward(self, inputs: torch.Tensor): + """Wrapped forward pass pass. + + Parses inputs into GA + scalar representation, calls the forward pass of the wrapped net, + and extracts the outputs from the GA + scalar representation again. + + Parameters + ---------- + inputs : torch.Tensor + Raw inputs, as given by dataset. + + Returns + ------- + outputs : torch.Tensor + Raw outputs, as expected in dataset. 
+ """ + + multivector, scalars = self.embed_into_ga(inputs) + multivector_outputs, scalar_outputs = self.net( + multivector, + scalars=scalars, + ) + outputs = self.extract_from_ga(multivector_outputs, scalar_outputs) + + return outputs def embed_into_ga(self, inputs): """Embeds raw inputs into the geometric algebra (+ scalar) representation. @@ -207,8 +229,8 @@ def embed_into_ga(self, inputs): multivector = multivector.unsqueeze(2) # (batchsize, 3, 1, 16) scalars = torch.zeros( - (batchsize, 1, 1), device=inputs.device - ) # (batchsize, 1, 1) + (batchsize, 3, 1), device=inputs.device + ) # (batchsize, 3, 1) return multivector, scalars def extract_from_ga(self, multivector, scalars): @@ -227,8 +249,6 @@ def extract_from_ga(self, multivector, scalars): ------- outputs : torch.Tensor Raw outputs, as expected in dataset. - other : torch.Tensor - Additional output data, e.g. required for regularization. """ _, num_objects, num_channels, num_ga_components = multivector.shape @@ -241,4 +261,4 @@ def extract_from_ga(self, multivector, scalars): reco = torch.stack((pt, pW), dim=1) reco = reco[:, :, 0, :] # pick first output channel # reco = reco.mean(dim=2) # average over output channels (much worse, probably because mean breaks symmetry) - return reco, None + return reco diff --git a/tests_regression/test_regression.py b/tests_regression/test_regression.py index 1cbceece..b244d843 100644 --- a/tests_regression/test_regression.py +++ b/tests_regression/test_regression.py @@ -43,7 +43,8 @@ def gatr_factory(wrapper_class): @pytest.mark.parametrize("model_factory", [gatr_factory], ids=["GATr"]) @pytest.mark.parametrize( "data,wrapper_class", - [ # (ParticleMassDataset(), ParticleMassWrapper), + [ + (ParticleMassDataset(), ParticleMassWrapper), (TopReconstructionDataset(), TopReconstructionWrapper), ], )
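
Note on the optimizer migration in base_experiment.py: both the Lion branch and the newly added ADOPT branch now come from the pytorch_optimizer package (which is why requirements.txt gains that dependency). Below is a minimal, self-contained sketch of that selection logic, not the repository's exact code: the helper name make_optimizer and the keyword defaults are illustrative, only the betas/weight_decay keywords mirror what the patch passes, and the learning rate is left at the library defaults here.

import torch
import pytorch_optimizer


def make_optimizer(name, params, betas=(0.9, 0.999), weight_decay=0.0):
    # Mirrors the patched _init_optimizer branches that route through pytorch_optimizer;
    # the experiment code reads these values from cfg.training rather than keyword defaults.
    if name == "Lion":
        return pytorch_optimizer.Lion(params, betas=betas, weight_decay=weight_decay)
    if name == "ADOPT":
        return pytorch_optimizer.ADOPT(params, betas=betas, weight_decay=weight_decay)
    raise ValueError(f"unsupported optimizer {name}")


model = torch.nn.Linear(4, 1)
optimizer = make_optimizer("ADOPT", model.parameters())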
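
Note on the dsi.py change: the None-initialized normalization statistics become registered buffers plus a boolean inv_inited flag, so the fitted mean/std end up in the state_dict and follow the module across devices. The sketch below shows that pattern on a toy 2D input; the class name LazyStandardizer and the shapes are illustrative, and flipping the flag after the first batch is part of the sketch rather than something visible in the quoted hunk.

import torch


class LazyStandardizer(torch.nn.Module):
    """Standardize inputs with statistics fitted lazily on the first batch."""

    def __init__(self, n_features):
        super().__init__()
        # buffers (unlike plain attributes) are checkpointed and moved by .to(device)
        self.register_buffer("inited", torch.tensor(False, dtype=torch.bool))
        self.register_buffer("mean", torch.zeros(1, n_features))
        self.register_buffer("std", torch.ones(1, n_features))

    def forward(self, x):
        if not self.inited:
            # fit once on the first (large) batch, then keep the statistics fixed
            self.mean = x.mean(dim=0, keepdim=True)
            self.std = x.std(dim=0, keepdim=True).clamp(min=1e-5)
            self.inited = torch.tensor(True, dtype=torch.bool)
        return (x - self.mean) / self.std


standardizer = LazyStandardizer(n_features=6)
out = standardizer(torch.randn(128, 6))
assert "mean" in standardizer.state_dict()  # would not hold for a plain self.mean attribute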
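
Note on the spurion/global-token index construction in embedding.py: the vectorized stack-and-offset expression is easier to trust with a small standalone check showing it reproduces the per-event arange loop it replaces. The toy ptr values and n_spurions below are made up; the two constructions follow the removed and added lines of the patch, with the loop version iterating over plain ints for clarity.

import torch

ptr = torch.tensor([0, 3, 7, 12])  # toy CSR-style event offsets for 3 events
n_spurions = 2
batchsize = ptr.numel() - 1

# pre-patch construction: one arange per event, shifted by the spurions
# already inserted in front of earlier events
old = torch.cat(
    [
        torch.arange(start + i * n_spurions, start + (i + 1) * n_spurions)
        for i, start in enumerate(ptr[:-1].tolist())
    ]
)

# post-patch construction: fully vectorized
new = torch.stack(
    [ptr[:-1] + i for i in range(n_spurions)], dim=0
) + n_spurions * torch.arange(batchsize, device=ptr.device)
new = new.permute(1, 0).flatten()

assert torch.equal(old, new)  # both give tensor([0, 1, 5, 6, 11, 12])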