From 283ab05a5ffe61d4b14674371ed6390caed560dd Mon Sep 17 00:00:00 2001
From: David Nabergoj
Date: Tue, 3 Sep 2024 17:35:50 +0200
Subject: [PATCH] Add maximum batch size limit in megabytes for adaptive batch size; add divergence checks for SVI

---
 torchflows/flows.py | 80 ++++++++++++++++++++++++++++-----------------
 1 file changed, 49 insertions(+), 31 deletions(-)

diff --git a/torchflows/flows.py b/torchflows/flows.py
index 22bde3c..8bb31bb 100644
--- a/torchflows/flows.py
+++ b/torchflows/flows.py
@@ -79,7 +79,8 @@ def fit(self,
             context_val: torch.Tensor = None,
             keep_best_weights: bool = True,
             early_stopping: bool = False,
-            early_stopping_threshold: int = 50):
+            early_stopping_threshold: int = 50,
+            max_batch_size_mb: int = 2000):
         """Fit the normalizing flow to a dataset.
 
         Fitting the flow means finding the parameters of the bijection that maximize the probability of training data.
@@ -100,6 +101,7 @@ def fit(self,
         :param keep_best_weights: if True and validation data is provided, keep the bijection weights with the highest probability of validation data.
         :param early_stopping: if True and validation data is provided, stop the training procedure early once validation loss stops improving for a specified number of consecutive epochs.
         :param early_stopping_threshold: if early_stopping is True, fitting stops after no improvement in validation loss for this many epochs.
+        :param int max_batch_size_mb: maximum batch size in megabytes.
         """
         if len(list(self.parameters())) == 0:
             # If the flow has no trainable parameters, do nothing
@@ -114,6 +116,10 @@ def fit(self,
         elif isinstance(batch_size, str) and batch_size == "adaptive":
             min_batch_size = max(32, min(1024, len(x_train) // 100))
             max_batch_size = min(4096, len(x_train) // 10)
+
+            event_size_mb = self.event_size / 2 ** 20
+            max_batch_size = max(1, min(max_batch_size, int(max_batch_size_mb / event_size_mb)))
+
             batch_size_adaptation_interval = 10  # double the batch size every 10 epochs
             adaptive_batch_size = True
             batch_size = min_batch_size
@@ -290,42 +296,54 @@ def variational_fit(self,
                     print('Flow training diverged')
                     print('Reverting to initial weights')
                     break
-
-            optimizer.zero_grad()
-            flow_x, flow_log_prob = self.sample(n_samples, return_log_prob=True)
-            target_log_prob_value = target_log_prob(flow_x)
-            loss = -torch.mean(target_log_prob_value + flow_log_prob)
-            loss += self.regularization()
-            epoch_diverged = False
-            if check_for_divergences:
-                if not torch.isfinite(loss):
-                    epoch_diverged = True
-                if torch.max(torch.abs(flow_x)) > 1e8:
-                    epoch_diverged = True
-                elif torch.max(torch.abs(flow_log_prob)) > 1e6:
-                    epoch_diverged = True
-                elif torch.any(~torch.isfinite(flow_x)):
-                    epoch_diverged = True
-                elif torch.any(~torch.isfinite(flow_log_prob)):
-                    epoch_diverged = True
-            n_divergences += epoch_diverged
+            epoch_diverged = False
+            optimizer.zero_grad()
 
-            if not epoch_diverged:
-                loss.backward()
-                optimizer.step()
-                if loss < best_loss:
-                    best_loss = loss
-                    best_epoch = epoch
-                    if keep_best_weights:
-                        best_weights = deepcopy(self.state_dict())
-            else:
+            try:
+                flow_x, flow_log_prob = self.sample(n_samples, return_log_prob=True)
+                target_log_prob_value = target_log_prob(flow_x)
+                loss = -torch.mean(target_log_prob_value + flow_log_prob)
+                loss += self.regularization()
+
+                if check_for_divergences:
+                    if not torch.isfinite(loss):
+                        epoch_diverged = True
+                    if torch.max(torch.abs(flow_x)) > 1e8:
+                        epoch_diverged = True
+                    elif torch.max(torch.abs(flow_log_prob)) > 1e6:
+                        epoch_diverged = True
+                    elif torch.any(~torch.isfinite(flow_x)):
+                        epoch_diverged = True
+                    elif torch.any(~torch.isfinite(flow_log_prob)):
+                        epoch_diverged = True
+
+                if not epoch_diverged:
+                    loss.backward()
+                    optimizer.step()
+                    if loss < best_loss:
+                        best_loss = loss
+                        best_epoch = epoch
+                        if keep_best_weights:
+                            best_weights = deepcopy(self.state_dict())
+                    mean_flow_log_prob = flow_log_prob.mean()
+                    mean_target_log_prob = target_log_prob_value.mean()
+                else:
+                    loss = torch.nan
+                    mean_flow_log_prob = torch.nan
+                    mean_target_log_prob = torch.nan
+            except ValueError:
+                epoch_diverged = True
                 loss = torch.nan
+                mean_flow_log_prob = torch.nan
+                mean_target_log_prob = torch.nan
+
+            n_divergences += epoch_diverged
 
             pbar.set_postfix_str(f'Loss: {loss:.4f} [best: {best_loss:.4f} @ {best_epoch}], '
                                  f'divergences: {n_divergences}, '
-                                 f'flow log_prob: {flow_log_prob.mean():.2f}, '
-                                 f'target log_prob: {target_log_prob_value.mean():.2f}')
+                                 f'flow log_prob: {mean_flow_log_prob:.2f}, '
+                                 f'target log_prob: {mean_target_log_prob:.2f}')
 
             if epoch - best_epoch > early_stopping_threshold and early_stopping:
                 break
 
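
A standalone sketch of the adaptive batch size cap above, with made-up numbers. It assumes self.event_size is the number of scalar entries in one event, so the formula effectively counts one entry as one byte:

    # Recomputing the cap outside the Flow class (illustrative values only).
    event_size = 64 * 64 * 3   # stands in for self.event_size: entries per event
    n_train = 100_000          # stands in for len(x_train)
    max_batch_size_mb = 2000   # new default in fit()

    min_batch_size = max(32, min(1024, n_train // 100))   # -> 1000
    max_batch_size = min(4096, n_train // 10)             # -> 4096

    event_size_mb = event_size / 2 ** 20                  # -> ~0.0117
    max_batch_size = max(1, min(max_batch_size, int(max_batch_size_mb / event_size_mb)))
    # int(2000 / 0.0117...) = 170666, so the cap is inactive here and
    # max_batch_size stays 4096; max_batch_size_mb=10 would clamp it to 853.

The outer max(1, ...) guarantees at least one event per batch even when a single event exceeds the megabyte cap.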
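A hypothetical end-to-end exercise of both changed paths. The Flow and RealNVP setup follows the torchflows README; treating check_for_divergences as a keyword of variational_fit is an assumption based on its use in the diff:

    import torch
    from torchflows import Flow
    from torchflows.bijections import RealNVP

    torch.manual_seed(0)
    flow = Flow(RealNVP(3))  # flow over 3-dimensional events

    # Maximum-likelihood fit with adaptive batching, bounded by the new memory cap:
    x_train = torch.randn(5000, 3)
    flow.fit(x_train, batch_size='adaptive', max_batch_size_mb=500)

    # SVI fit against a standard normal target, with divergence guards enabled:
    def target_log_prob(x):
        return -0.5 * x.square().sum(dim=-1)

    flow.variational_fit(target_log_prob, check_for_divergences=True)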