From 6eec626552b6c27c552434ccec1fba825737b391 Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Wed, 20 Dec 2023 19:01:49 +0100 Subject: [PATCH 01/30] Fix device --- .../finite/autoregressive/conditioner_transforms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/normalizing_flows/bijections/finite/autoregressive/conditioner_transforms.py b/normalizing_flows/bijections/finite/autoregressive/conditioner_transforms.py index 608bcc5..8ff8306 100644 --- a/normalizing_flows/bijections/finite/autoregressive/conditioner_transforms.py +++ b/normalizing_flows/bijections/finite/autoregressive/conditioner_transforms.py @@ -83,12 +83,12 @@ def forward(self, x: torch.Tensor, context: torch.Tensor = None): else: if self.n_global_parameters == self.n_transformer_parameters: # All transformer parameters are learned globally - output = torch.zeros(*batch_shape, *self.parameter_shape) + output = torch.zeros(*batch_shape, *self.parameter_shape, device=x.device) output[..., self.global_parameter_mask] = self.global_theta_flat return output else: # Some transformer parameters are learned globally, some are predicted - output = torch.zeros(*batch_shape, *self.parameter_shape) + output = torch.zeros(*batch_shape, *self.parameter_shape, device=x.device) output[..., self.global_parameter_mask] = self.global_theta_flat output[..., ~self.global_parameter_mask] = self.predict_theta_flat(x, context) return output From 76db053a7f115b47183db24a82e91f9b37951f1b Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Wed, 20 Dec 2023 19:06:30 +0100 Subject: [PATCH 02/30] Fix weight shape --- normalizing_flows/flows.py | 8 +++++--- normalizing_flows/utils.py | 3 ++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/normalizing_flows/flows.py b/normalizing_flows/flows.py index ac5f5d5..43f949b 100644 --- a/normalizing_flows/flows.py +++ b/normalizing_flows/flows.py @@ -166,7 +166,8 @@ def fit(self, context_train, "training", batch_size=batch_size, - shuffle=shuffle + shuffle=shuffle, + event_shape=self.bijection.event_shape ) # Process validation data @@ -177,7 +178,8 @@ def fit(self, context_val, "validation", batch_size=batch_size, - shuffle=shuffle + shuffle=shuffle, + event_shape=self.bijection.event_shape ) best_val_loss = torch.inf @@ -190,7 +192,7 @@ def compute_batch_loss(batch_, reduction: callable = torch.mean): batch_log_prob = self.log_prob(batch_x.to(self.loc), context=batch_context) batch_weights = batch_weights.to(self.loc) - assert batch_log_prob.shape == batch_weights.shape + assert batch_log_prob.shape == batch_weights.shape, f"{batch_log_prob.shape = }, {batch_weights.shape = }" batch_loss = -reduction(batch_log_prob * batch_weights) / n_event_dims return batch_loss diff --git a/normalizing_flows/utils.py b/normalizing_flows/utils.py index 7a0a5e1..d13a170 100644 --- a/normalizing_flows/utils.py +++ b/normalizing_flows/utils.py @@ -190,6 +190,7 @@ def create_data_loader(x: torch.Tensor, weights: Optional[torch.Tensor], context: Optional[torch.Tensor], label: str, + event_shape, **kwargs): """ Creates a DataLoader object for NF training. 
@@ -198,7 +199,7 @@ def create_data_loader(x: torch.Tensor, # Set default weights if weights is None: - weights = torch.ones(len(x)) + weights = torch.ones(size=get_batch_shape(x, event_shape)) # Create the training dataset and loader if len(x) != len(weights): From 304092ea20070e73e0bfc5cf96b2c5475b84bb97 Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Wed, 20 Dec 2023 20:45:56 +0100 Subject: [PATCH 03/30] Change defaults for LRS --- .../autoregressive/transformers/spline/linear_rational.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/normalizing_flows/bijections/finite/autoregressive/transformers/spline/linear_rational.py b/normalizing_flows/bijections/finite/autoregressive/transformers/spline/linear_rational.py index a80e887..f5ec325 100644 --- a/normalizing_flows/bijections/finite/autoregressive/transformers/spline/linear_rational.py +++ b/normalizing_flows/bijections/finite/autoregressive/transformers/spline/linear_rational.py @@ -16,8 +16,8 @@ def __init__(self, event_shape: Union[torch.Size, Tuple[int, ...]], boundary: fl max_output=boundary, **kwargs ) - self.min_bin_width = 1e-2 - self.min_bin_height = 1e-2 + self.min_bin_width = 1e-5 + self.min_bin_height = 1e-5 self.min_d = 1e-5 self.const = math.log(math.exp(1 - self.min_d) - 1) # to ensure identity initialization self.eps = 5e-10 # Epsilon for numerical stability when computing forward/inverse @@ -83,9 +83,9 @@ def compute_knots(self, u_x, u_y, u_l, u_d): # u_y acts as a delta # u_d acts as a delta knots_x = self.compute_bins(u_x, self.min_input, self.max_input, self.min_bin_width) - knots_y = self.compute_bins(u_x + u_y / 1000, self.min_output, self.max_output, self.min_bin_height) + knots_y = self.compute_bins(u_x + u_y / 100, self.min_output, self.max_output, self.min_bin_height) knots_lambda = torch.sigmoid(u_l) - knots_d = self.compute_derivatives(self.const + u_d / 1000) + knots_d = self.compute_derivatives(self.const + u_d / 100) return knots_x, knots_y, knots_d, knots_lambda def forward_1d(self, x, h): From f0167a363b851b89e32e9f8b8a3288fe11abd885 Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Thu, 21 Dec 2023 13:54:23 +0100 Subject: [PATCH 04/30] Change default bin size in LRS --- .../autoregressive/transformers/spline/linear_rational.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/normalizing_flows/bijections/finite/autoregressive/transformers/spline/linear_rational.py b/normalizing_flows/bijections/finite/autoregressive/transformers/spline/linear_rational.py index f5ec325..5dce471 100644 --- a/normalizing_flows/bijections/finite/autoregressive/transformers/spline/linear_rational.py +++ b/normalizing_flows/bijections/finite/autoregressive/transformers/spline/linear_rational.py @@ -16,8 +16,8 @@ def __init__(self, event_shape: Union[torch.Size, Tuple[int, ...]], boundary: fl max_output=boundary, **kwargs ) - self.min_bin_width = 1e-5 - self.min_bin_height = 1e-5 + self.min_bin_width = 1e-2 + self.min_bin_height = 1e-2 self.min_d = 1e-5 self.const = math.log(math.exp(1 - self.min_d) - 1) # to ensure identity initialization self.eps = 5e-10 # Epsilon for numerical stability when computing forward/inverse From 9f22ac81c34b46e6d935f06d35b624a47756c3c4 Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Thu, 21 Dec 2023 13:54:39 +0100 Subject: [PATCH 05/30] Reduce number of tests for faster CI/CD --- test/constants.py | 18 +++++++++--------- test/test_cuda.py | 4 ++++ test/test_sigmoid_transformer.py | 32 ++++++++++++-------------------- test/test_spline.py | 5 
+++-- test/test_umnn.py | 5 +++-- 5 files changed, 31 insertions(+), 33 deletions(-) diff --git a/test/constants.py b/test/constants.py index 381c918..e64b7d8 100644 --- a/test/constants.py +++ b/test/constants.py @@ -1,11 +1,11 @@ __test_constants = { - 'batch_shape': [(1,), (2,), (5,), (2, 4), (5, 2, 3)], - 'event_shape': [(2,), (3,), (2, 4), (40,), (3, 5, 2)], - 'image_shape': [(4, 4, 3), (20, 20, 3), (10, 20, 3), (200, 200, 3), (20, 20, 1), (10, 20, 1)], - 'context_shape': [None, (2,), (3,), (2, 4), (5,)], - 'input_event_shape': [(2,), (3,), (2, 4), (40,), (3, 5, 2)], - 'output_event_shape': [(2,), (3,), (2, 4), (40,), (3, 5, 2)], - 'n_predicted_parameters': [1, 2, 10, 50, 100], - 'predicted_parameter_shape': [(1,), (2,), (5,), (2, 4), (5, 2, 3)], - 'parameter_shape_per_element': [(1,), (2,), (5,), (2, 4), (5, 2, 3)], + 'batch_shape': [(1,), (2,), (5,), (5, 2, 3)], + 'event_shape': [(2,), (3,), (3, 5, 2)], + 'image_shape': [(4, 4, 3), (20, 20, 3), (10, 20, 3), (20, 20, 1), (10, 20, 1)], + 'context_shape': [None, (2,), (3,), (3, 5, 2)], + 'input_event_shape': [(2,), (3,), (3, 5, 2)], + 'output_event_shape': [(2,), (3,), (3, 5, 2)], + 'n_predicted_parameters': [1, 2, 10, 50], + 'predicted_parameter_shape': [(1,), (2,), (5,), (5, 2, 3)], + 'parameter_shape_per_element': [(1,), (2,), (5,), (5, 2, 3)], } diff --git a/test/test_cuda.py b/test/test_cuda.py index 9ff3c97..0a81ad3 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -4,6 +4,7 @@ from normalizing_flows import Flow +@pytest.mark.skip(reason="Too slow on CI/CD") def test_real_nvp_log_prob_data_on_cpu(): if not torch.cuda.is_available(): pytest.skip("CUDA not available") @@ -19,6 +20,7 @@ def test_real_nvp_log_prob_data_on_cpu(): flow.log_prob(x_train) +@pytest.mark.skip(reason="Too slow on CI/CD") def test_real_nvp_log_prob_data_on_gpu(): if not torch.cuda.is_available(): pytest.skip("CUDA not available") @@ -34,6 +36,7 @@ def test_real_nvp_log_prob_data_on_gpu(): flow.log_prob(x_train.cuda()) +@pytest.mark.skip(reason="Too slow on CI/CD") def test_real_nvp_fit_data_on_cpu(): if not torch.cuda.is_available(): pytest.skip("CUDA not available") @@ -49,6 +52,7 @@ def test_real_nvp_fit_data_on_cpu(): flow.fit(x_train) +@pytest.mark.skip(reason="Too slow on CI/CD") def test_real_nvp_fit_data_on_gpu(): if not torch.cuda.is_available(): pytest.skip("CUDA not available") diff --git a/test/test_sigmoid_transformer.py b/test/test_sigmoid_transformer.py index 9c5599f..3ca84dd 100644 --- a/test/test_sigmoid_transformer.py +++ b/test/test_sigmoid_transformer.py @@ -5,10 +5,11 @@ from normalizing_flows.bijections import DSCoupling, CouplingDSF from normalizing_flows.bijections.finite.autoregressive.transformers.combination.sigmoid import Sigmoid, DeepSigmoid from normalizing_flows.bijections.base import invert +from test.constants import __test_constants -@pytest.mark.parametrize('batch_shape', [(7,), (25,), (13,), (2, 37)]) -@pytest.mark.parametrize('event_shape', [(5,), (1,), (100,), (3, 56, 2)]) +@pytest.mark.parametrize('batch_shape', __test_constants['batch_shape']) +@pytest.mark.parametrize('event_shape', __test_constants['event_shape']) def test_sigmoid_transformer(event_shape, batch_shape): torch.manual_seed(0) @@ -38,8 +39,8 @@ def test_sigmoid_transformer(event_shape, batch_shape): assert torch.all(~torch.isinf(log_det_inverse)) -@pytest.mark.parametrize('batch_shape', [(7,), (25,), (13,), (2, 37)]) -@pytest.mark.parametrize('event_shape', [(1,), (5,), (100,), (3, 56, 2)]) +@pytest.mark.parametrize('batch_shape', 
__test_constants['batch_shape']) +@pytest.mark.parametrize('event_shape', __test_constants['event_shape']) @pytest.mark.parametrize('hidden_dim', [1, 2, 4, 8, 16, 32]) def test_deep_sigmoid_transformer(event_shape, batch_shape, hidden_dim): torch.manual_seed(0) @@ -70,8 +71,8 @@ def test_deep_sigmoid_transformer(event_shape, batch_shape, hidden_dim): assert torch.all(~torch.isinf(log_det_inverse)) -@pytest.mark.parametrize('batch_shape', [(7,), (25,), (13,), (2, 37)]) -@pytest.mark.parametrize('event_shape', [(3, 56, 2), (2,), (5,), (100,)]) +@pytest.mark.parametrize('batch_shape', __test_constants['batch_shape']) +@pytest.mark.parametrize('event_shape', __test_constants['event_shape']) def test_deep_sigmoid_coupling(event_shape, batch_shape): torch.manual_seed(0) @@ -99,22 +100,13 @@ def test_deep_sigmoid_coupling(event_shape, batch_shape): assert torch.all(~torch.isinf(log_det_inverse)) -@pytest.mark.parametrize('batch_shape', [ - (2, 37), - (7,), - (13,), - (25,), -]) -@pytest.mark.parametrize('n_dim', [ - 1000, - 2, - 5, - 100, -]) -def test_deep_sigmoid_coupling_flow(n_dim, batch_shape): +@pytest.mark.parametrize('batch_shape', __test_constants['batch_shape']) +@pytest.mark.parametrize('event_shape', __test_constants['event_shape']) +def test_deep_sigmoid_coupling_flow(event_shape, batch_shape): torch.manual_seed(0) - event_shape = torch.Size((n_dim,)) + n_dim = int(torch.prod(torch.tensor(event_shape))) + event_shape = (n_dim,) # Overwrite forward_flow = Flow(CouplingDSF(event_shape)) x = torch.randn(size=(*batch_shape, n_dim)) diff --git a/test/test_spline.py b/test/test_spline.py index 55dd02b..681e2c9 100644 --- a/test/test_spline.py +++ b/test/test_spline.py @@ -6,6 +6,7 @@ from normalizing_flows.bijections.finite.autoregressive.transformers.spline.linear import Linear from normalizing_flows.bijections.finite.autoregressive.transformers.spline.cubic import Cubic from normalizing_flows.bijections.finite.autoregressive.transformers.spline.basis import Basis +from test.constants import __test_constants def test_linear_rational(): @@ -81,8 +82,8 @@ def test_2d_spline(spline_class): @pytest.mark.parametrize('boundary', [1.0, 5.0, 50.0]) -@pytest.mark.parametrize('batch_shape', [(1,), (2,), (10,), (100,), (2, 5, 6, 3)]) -@pytest.mark.parametrize('event_shape', [(1,), (2,), (10,), (100,), (3, 4, 1)]) +@pytest.mark.parametrize('batch_shape', __test_constants["batch_shape"]) +@pytest.mark.parametrize('event_shape', __test_constants["event_shape"]) @pytest.mark.parametrize('spline_class', [ RationalQuadratic, LinearRational, diff --git a/test/test_umnn.py b/test/test_umnn.py index c6fe481..5b6f49a 100644 --- a/test/test_umnn.py +++ b/test/test_umnn.py @@ -5,10 +5,11 @@ from normalizing_flows.bijections.finite.autoregressive.transformers.integration.unconstrained_monotonic_neural_network import \ UnconstrainedMonotonicNeuralNetwork +from test.constants import __test_constants -@pytest.mark.parametrize('batch_shape', [(1,), (2,), (5,), (2, 4), (100,), (5, 1, 6, 7), (3, 13, 8)]) -@pytest.mark.parametrize('event_shape', [(2,), (3,), (2, 4), (25,)]) +@pytest.mark.parametrize('batch_shape', __test_constants["batch_shape"]) +@pytest.mark.parametrize('event_shape', __test_constants["event_shape"]) def test_umnn(batch_shape: Tuple, event_shape: Tuple): # Event shape cannot be too big, otherwise torch.manual_seed(0) From 8ac96f892c1f85772233e86fb5889d8b90e95fbc Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Mon, 25 Dec 2023 05:13:07 +0100 Subject: [PATCH 06/30] Rewrite elementwise bijection to 
avoid null conditioner --- .../finite/autoregressive/layers_base.py | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/normalizing_flows/bijections/finite/autoregressive/layers_base.py b/normalizing_flows/bijections/finite/autoregressive/layers_base.py index 83fa848..fab59fe 100644 --- a/normalizing_flows/bijections/finite/autoregressive/layers_base.py +++ b/normalizing_flows/bijections/finite/autoregressive/layers_base.py @@ -1,6 +1,7 @@ from typing import Tuple, Optional, Union import torch +import torch.nn as nn from normalizing_flows.bijections.finite.autoregressive.conditioners.base import Conditioner, NullConditioner from normalizing_flows.bijections.finite.autoregressive.conditioner_transforms import ConditionerTransform, Constant, \ @@ -160,7 +161,24 @@ class ElementwiseBijection(AutoregressiveBijection): def __init__(self, transformer: ScalarTransformer, fill_value: float = None): super().__init__( transformer.event_shape, - NullConditioner(), + None, transformer, - Constant(transformer.event_shape, transformer.parameter_shape, fill_value=fill_value) + None ) + + if fill_value is None: + self.value = nn.Parameter(torch.randn(*transformer.parameter_shape)) + else: + self.value = nn.Parameter(torch.full(size=transformer.parameter_shape, fill_value=fill_value)) + + def prepare_h(self, batch_shape): + tmp = self.value[[None] * len(batch_shape)] + return tmp.repeat(*batch_shape, *([1] * len(self.transformer.parameter_shape))) + + def forward(self, x: torch.Tensor, context: torch.Tensor = None) -> Tuple[torch.Tensor, torch.Tensor]: + h = self.prepare_h(get_batch_shape(x, self.event_shape)) + return self.transformer.forward(x, h) + + def inverse(self, z: torch.Tensor, context: torch.Tensor = None) -> Tuple[torch.Tensor, torch.Tensor]: + h = self.prepare_h(get_batch_shape(z, self.event_shape)) + return self.transformer.inverse(z, h) From 8a8572bbf257c9fd397b7c8ce6163ca505f4ada5 Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Mon, 25 Dec 2023 05:17:37 +0100 Subject: [PATCH 07/30] Remove Conditioner and NullConditioner classes --- .../autoregressive/conditioners/base.py | 21 ------------------- .../autoregressive/conditioners/graphical.py | 12 ----------- .../autoregressive/conditioners/recurrent.py | 0 .../finite/autoregressive/layers_base.py | 8 ++----- 4 files changed, 2 insertions(+), 39 deletions(-) delete mode 100644 normalizing_flows/bijections/finite/autoregressive/conditioners/base.py delete mode 100644 normalizing_flows/bijections/finite/autoregressive/conditioners/graphical.py delete mode 100644 normalizing_flows/bijections/finite/autoregressive/conditioners/recurrent.py diff --git a/normalizing_flows/bijections/finite/autoregressive/conditioners/base.py b/normalizing_flows/bijections/finite/autoregressive/conditioners/base.py deleted file mode 100644 index 7571640..0000000 --- a/normalizing_flows/bijections/finite/autoregressive/conditioners/base.py +++ /dev/null @@ -1,21 +0,0 @@ -import torch -import torch.nn as nn - -from normalizing_flows.bijections.finite.autoregressive.conditioner_transforms import ConditionerTransform - - -class Conditioner(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x: torch.Tensor, transform: ConditionerTransform, context: torch.Tensor = None, **kwargs) -> torch.Tensor: - raise NotImplementedError - - -class NullConditioner(Conditioner): - def __init__(self): - # Each dimension affects only itself - super().__init__() - - def forward(self, x: torch.Tensor, transform: 
ConditionerTransform, context: torch.Tensor = None) -> torch.Tensor: - return transform(x, context=context).to(x) # (*batch_shape, *event_shape, n_parameters) diff --git a/normalizing_flows/bijections/finite/autoregressive/conditioners/graphical.py b/normalizing_flows/bijections/finite/autoregressive/conditioners/graphical.py deleted file mode 100644 index 0fcee46..0000000 --- a/normalizing_flows/bijections/finite/autoregressive/conditioners/graphical.py +++ /dev/null @@ -1,12 +0,0 @@ -import torch - -from normalizing_flows.bijections.finite.autoregressive.conditioners.base import Conditioner -from normalizing_flows.bijections.finite.autoregressive.conditioner_transforms import ConditionerTransform - - -class GraphicalConditioner(Conditioner): - def __init__(self): - super().__init__() - - def forward(self, x: torch.Tensor, transform: ConditionerTransform, context: torch.Tensor = None) -> torch.Tensor: - pass diff --git a/normalizing_flows/bijections/finite/autoregressive/conditioners/recurrent.py b/normalizing_flows/bijections/finite/autoregressive/conditioners/recurrent.py deleted file mode 100644 index e69de29..0000000 diff --git a/normalizing_flows/bijections/finite/autoregressive/layers_base.py b/normalizing_flows/bijections/finite/autoregressive/layers_base.py index fab59fe..afc7a00 100644 --- a/normalizing_flows/bijections/finite/autoregressive/layers_base.py +++ b/normalizing_flows/bijections/finite/autoregressive/layers_base.py @@ -3,7 +3,6 @@ import torch import torch.nn as nn -from normalizing_flows.bijections.finite.autoregressive.conditioners.base import Conditioner, NullConditioner from normalizing_flows.bijections.finite.autoregressive.conditioner_transforms import ConditionerTransform, Constant, \ MADE from normalizing_flows.bijections.finite.autoregressive.conditioners.coupling_masks import CouplingMask @@ -15,12 +14,10 @@ class AutoregressiveBijection(Bijection): def __init__(self, event_shape, - conditioner: Optional[Conditioner], transformer: Union[TensorTransformer, ScalarTransformer], conditioner_transform: ConditionerTransform, **kwargs): super().__init__(event_shape=event_shape) - self.conditioner = conditioner self.conditioner_transform = conditioner_transform self.transformer = transformer @@ -58,7 +55,7 @@ def __init__(self, coupling_mask: CouplingMask, conditioner_transform: ConditionerTransform, **kwargs): - super().__init__(coupling_mask.event_shape, None, transformer, conditioner_transform, **kwargs) + super().__init__(coupling_mask.event_shape, transformer, conditioner_transform, **kwargs) self.coupling_mask = coupling_mask assert conditioner_transform.input_event_shape == (coupling_mask.constant_event_size,) @@ -113,7 +110,7 @@ def __init__(self, context_shape=context_shape, **kwargs ) - super().__init__(transformer.event_shape, None, transformer, conditioner_transform) + super().__init__(transformer.event_shape, transformer, conditioner_transform) def apply_conditioner_transformer(self, inputs, context, forward: bool = True): h = self.conditioner_transform(inputs, context) @@ -161,7 +158,6 @@ class ElementwiseBijection(AutoregressiveBijection): def __init__(self, transformer: ScalarTransformer, fill_value: float = None): super().__init__( transformer.event_shape, - None, transformer, None ) From cc607f62762416f711a554e8fb936ef66f2b66cf Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Mon, 25 Dec 2023 05:24:05 +0100 Subject: [PATCH 08/30] Rename files --- .../{conditioners => conditioning}/__init__.py | 0 .../{conditioners => conditioning}/context.py | 0 
.../{conditioners => conditioning}/coupling_masks.py | 0 .../transforms.py} | 6 +++--- .../bijections/finite/autoregressive/layers.py | 4 ++-- .../bijections/finite/autoregressive/layers_base.py | 6 +++--- test/test_conditioner_transforms.py | 4 ++-- 7 files changed, 10 insertions(+), 10 deletions(-) rename normalizing_flows/bijections/finite/autoregressive/{conditioners => conditioning}/__init__.py (100%) rename normalizing_flows/bijections/finite/autoregressive/{conditioners => conditioning}/context.py (100%) rename normalizing_flows/bijections/finite/autoregressive/{conditioners => conditioning}/coupling_masks.py (100%) rename normalizing_flows/bijections/finite/autoregressive/{conditioner_transforms.py => conditioning/transforms.py} (98%) diff --git a/normalizing_flows/bijections/finite/autoregressive/conditioners/__init__.py b/normalizing_flows/bijections/finite/autoregressive/conditioning/__init__.py similarity index 100% rename from normalizing_flows/bijections/finite/autoregressive/conditioners/__init__.py rename to normalizing_flows/bijections/finite/autoregressive/conditioning/__init__.py diff --git a/normalizing_flows/bijections/finite/autoregressive/conditioners/context.py b/normalizing_flows/bijections/finite/autoregressive/conditioning/context.py similarity index 100% rename from normalizing_flows/bijections/finite/autoregressive/conditioners/context.py rename to normalizing_flows/bijections/finite/autoregressive/conditioning/context.py diff --git a/normalizing_flows/bijections/finite/autoregressive/conditioners/coupling_masks.py b/normalizing_flows/bijections/finite/autoregressive/conditioning/coupling_masks.py similarity index 100% rename from normalizing_flows/bijections/finite/autoregressive/conditioners/coupling_masks.py rename to normalizing_flows/bijections/finite/autoregressive/conditioning/coupling_masks.py diff --git a/normalizing_flows/bijections/finite/autoregressive/conditioner_transforms.py b/normalizing_flows/bijections/finite/autoregressive/conditioning/transforms.py similarity index 98% rename from normalizing_flows/bijections/finite/autoregressive/conditioner_transforms.py rename to normalizing_flows/bijections/finite/autoregressive/conditioning/transforms.py index 8ff8306..74f3966 100644 --- a/normalizing_flows/bijections/finite/autoregressive/conditioner_transforms.py +++ b/normalizing_flows/bijections/finite/autoregressive/conditioning/transforms.py @@ -1,12 +1,12 @@ import math -from typing import Tuple, Union, Type, List +from typing import Tuple, Union import torch import torch.nn as nn -from normalizing_flows.bijections.finite.autoregressive.conditioners.context import Concatenation, ContextCombiner, \ +from normalizing_flows.bijections.finite.autoregressive.conditioning.context import Concatenation, ContextCombiner, \ Bypass -from normalizing_flows.utils import get_batch_shape, pad_leading_dims +from normalizing_flows.utils import get_batch_shape class ConditionerTransform(nn.Module): diff --git a/normalizing_flows/bijections/finite/autoregressive/layers.py b/normalizing_flows/bijections/finite/autoregressive/layers.py index b340cbe..15b74ff 100644 --- a/normalizing_flows/bijections/finite/autoregressive/layers.py +++ b/normalizing_flows/bijections/finite/autoregressive/layers.py @@ -1,7 +1,7 @@ import torch -from normalizing_flows.bijections.finite.autoregressive.conditioner_transforms import FeedForward -from normalizing_flows.bijections.finite.autoregressive.conditioners.coupling_masks import HalfSplit +from 
normalizing_flows.bijections.finite.autoregressive.conditioning.transforms import FeedForward +from normalizing_flows.bijections.finite.autoregressive.conditioning.coupling_masks import HalfSplit from normalizing_flows.bijections.finite.autoregressive.layers_base import MaskedAutoregressiveBijection, \ InverseMaskedAutoregressiveBijection, ElementwiseBijection, CouplingBijection from normalizing_flows.bijections.finite.autoregressive.transformers.linear.affine import Scale, Affine, Shift diff --git a/normalizing_flows/bijections/finite/autoregressive/layers_base.py b/normalizing_flows/bijections/finite/autoregressive/layers_base.py index afc7a00..6562dda 100644 --- a/normalizing_flows/bijections/finite/autoregressive/layers_base.py +++ b/normalizing_flows/bijections/finite/autoregressive/layers_base.py @@ -1,11 +1,11 @@ -from typing import Tuple, Optional, Union +from typing import Tuple, Union import torch import torch.nn as nn -from normalizing_flows.bijections.finite.autoregressive.conditioner_transforms import ConditionerTransform, Constant, \ +from normalizing_flows.bijections.finite.autoregressive.conditioning.transforms import ConditionerTransform, \ MADE -from normalizing_flows.bijections.finite.autoregressive.conditioners.coupling_masks import CouplingMask +from normalizing_flows.bijections.finite.autoregressive.conditioning.coupling_masks import CouplingMask from normalizing_flows.bijections.finite.autoregressive.transformers.base import TensorTransformer, ScalarTransformer from normalizing_flows.bijections.base import Bijection from normalizing_flows.utils import flatten_event, unflatten_event, get_batch_shape diff --git a/test/test_conditioner_transforms.py b/test/test_conditioner_transforms.py index 79ac5fc..551e70c 100644 --- a/test/test_conditioner_transforms.py +++ b/test/test_conditioner_transforms.py @@ -1,8 +1,8 @@ import pytest import torch -from normalizing_flows.bijections.finite.autoregressive.conditioner_transforms import ( - MADE, FeedForward, LinearMADE, ResidualFeedForward, Constant, Linear, ConditionerTransform +from normalizing_flows.bijections.finite.autoregressive.conditioning.transforms import ( + MADE, FeedForward, LinearMADE, ResidualFeedForward, Linear, ConditionerTransform ) from test.constants import __test_constants From c0e239ef6f37a30ce3c517c99ddc7c16841a02c2 Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Wed, 27 Dec 2023 16:49:31 +0100 Subject: [PATCH 09/30] Towards fixing autograd in residual flows --- normalizing_flows/bijections/finite/residual/base.py | 10 +++++++--- .../bijections/finite/residual/iterative.py | 2 +- .../finite/residual/log_abs_det_estimators.py | 12 +++++++----- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/normalizing_flows/bijections/finite/residual/base.py b/normalizing_flows/bijections/finite/residual/base.py index 2954428..766ebf5 100644 --- a/normalizing_flows/bijections/finite/residual/base.py +++ b/normalizing_flows/bijections/finite/residual/base.py @@ -17,7 +17,7 @@ def __init__(self, event_shape: Union[torch.Size, Tuple[int, ...]]): super().__init__(event_shape) self.g: callable = None - def log_det(self, x): + def log_det(self, x, **kwargs): raise NotImplementedError def forward(self, @@ -30,7 +30,9 @@ def forward(self, if skip_log_det: log_det = torch.full(size=batch_shape, fill_value=torch.nan) else: - log_det = self.log_det(flatten_event(x, self.event_shape)) + x_flat = flatten_event(x, self.event_shape).clone() + x_flat.requires_grad_(True) + log_det = self.log_det(x_flat, 
training=self.training) return z, log_det @@ -47,7 +49,9 @@ def inverse(self, if skip_log_det: log_det = torch.full(size=batch_shape, fill_value=torch.nan) else: - log_det = -self.log_det(flatten_event(x, self.event_shape)) + x_flat = flatten_event(x, self.event_shape).clone() + x_flat.requires_grad_(True) + log_det = -self.log_det(x_flat, training=self.training) return x, log_det diff --git a/normalizing_flows/bijections/finite/residual/iterative.py b/normalizing_flows/bijections/finite/residual/iterative.py index 4b58553..c778350 100644 --- a/normalizing_flows/bijections/finite/residual/iterative.py +++ b/normalizing_flows/bijections/finite/residual/iterative.py @@ -81,4 +81,4 @@ def __init__(self, event_shape: Union[torch.Size, Tuple[int, ...]], context_shap super().__init__(event_shape) def log_det(self, x: torch.Tensor, **kwargs): - return log_det_roulette(self.g, x)[1] + return log_det_roulette(self.g, x, **kwargs)[1] diff --git a/normalizing_flows/bijections/finite/residual/log_abs_det_estimators.py b/normalizing_flows/bijections/finite/residual/log_abs_det_estimators.py index d3b644f..ca557e5 100644 --- a/normalizing_flows/bijections/finite/residual/log_abs_det_estimators.py +++ b/normalizing_flows/bijections/finite/residual/log_abs_det_estimators.py @@ -21,22 +21,24 @@ def power_series_log_abs_det_estimator(g: callable, assert n_iterations >= 2 w = noise # (batch_size, event_size, n_hutchinson_samples) - log_abs_det_jac_f = torch.zeros(size=(batch_size,)) + log_abs_det_jac_f = torch.zeros(size=(batch_size,)).to(x) g_value = None for k in range(1, n_iterations + 1): # Compute VJP, reshape appropriately for hutchinson averaging gs_r, ws_r = torch.autograd.functional.vjp( g, x[..., None].repeat(1, 1, n_hutchinson_samples).view(batch_size * n_hutchinson_samples, event_size), - w.view(batch_size * n_hutchinson_samples, event_size) + w.view(batch_size * n_hutchinson_samples, event_size), + create_graph=training ) if g_value is None: g_value = gs_r.view(batch_size, event_size, n_hutchinson_samples)[..., 0] w = ws_r.view(batch_size, event_size, n_hutchinson_samples) - log_abs_det_jac_f += (-1) ** (k + 1) / k * torch.sum(w * noise, dim=1).mean( - dim=1) # sum over event dim, average over hutchinson dim + + # sum over event dim, average over hutchinson dim + log_abs_det_jac_f += (-1) ** (k + 1) / k * torch.sum(w * noise, dim=1).mean(dim=1) assert log_abs_det_jac_f.shape == (batch_size,) return g_value, log_abs_det_jac_f @@ -79,7 +81,7 @@ class LogDeterminantEstimator(torch.autograd.Function): Autodiff support permits this function to be used in a computation graph. 
""" - # https://github.com/rtqichen/residual-flows/blob/master/lib/layers/iresblock.py#L186 + # https://github.com/rtqichen/residual-flows/blob/master/resflows/layers/iresblock.py#L249 @staticmethod def forward(ctx, estimator_function: callable, From 6a1785994b6fa23f039ab43c7b020b3940e21588 Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Wed, 27 Dec 2023 17:01:48 +0100 Subject: [PATCH 10/30] Fix autograd for InvertibleResNet --- .../bijections/finite/residual/log_abs_det_estimators.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/normalizing_flows/bijections/finite/residual/log_abs_det_estimators.py b/normalizing_flows/bijections/finite/residual/log_abs_det_estimators.py index ca557e5..1a4ec20 100644 --- a/normalizing_flows/bijections/finite/residual/log_abs_det_estimators.py +++ b/normalizing_flows/bijections/finite/residual/log_abs_det_estimators.py @@ -127,7 +127,7 @@ def backward(ctx, grad_g, grad_logdetgrad): g_params = params_and_grad[:len(params_and_grad) // 2] grad_params = params_and_grad[len(params_and_grad) // 2:] - dg_x, *dg_params = torch.autograd.grad(g_value, [x] + g_params, grad_g, allow_unused=True) + dg_x, *dg_params = torch.autograd.grad(g_value, [x] + g_params, grad_g, allow_unused=True, retain_graph=training) # Update based on gradient from log determinant. dL = grad_logdetgrad[0].detach() @@ -140,7 +140,7 @@ def backward(ctx, grad_g, grad_logdetgrad): grad_x.add_(dg_x) grad_params = tuple([dg.add_(djac) if djac is not None else dg for dg, djac in zip(dg_params, grad_params)]) - return (None, None, grad_x, None, None, None, None) + grad_params + return (None, None, grad_x, None, None) + grad_params def log_det_roulette(g: nn.Module, x: torch.Tensor, training: bool = False, p: float = 0.5): From 952b3a2b6577e6f8160a1e9473d9fc73d5c2f697 Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Wed, 27 Dec 2023 17:07:04 +0100 Subject: [PATCH 11/30] Fixing autograd for ResFlow --- .../bijections/finite/residual/log_abs_det_estimators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/normalizing_flows/bijections/finite/residual/log_abs_det_estimators.py b/normalizing_flows/bijections/finite/residual/log_abs_det_estimators.py index 1a4ec20..813c79e 100644 --- a/normalizing_flows/bijections/finite/residual/log_abs_det_estimators.py +++ b/normalizing_flows/bijections/finite/residual/log_abs_det_estimators.py @@ -69,7 +69,7 @@ def roulette_log_abs_det_estimator(g: callable, # P(N >= k) = 1 - P(N < k) = 1 - P(N <= k - 1) = 1 - cdf(k - 1) p_k = 1 - dist.cdf(torch.tensor(k - 1, dtype=torch.long)) neumann_vjp = neumann_vjp + (-1) ** k / (k * p_k) * w - g_value, vjp_jac = torch.autograd.functional.vjp(g, x, neumann_vjp) + g_value, vjp_jac = torch.autograd.functional.vjp(g, x, neumann_vjp, create_graph=True) # vjp_jac = torch.autograd.grad(g_value, x, neumann_vjp, create_graph=training)[0] log_abs_det_jac_f = torch.sum(vjp_jac * noise, dim=-1) return g_value, log_abs_det_jac_f From 3df736256ceee8300dceba5bda030d85ef43656d Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Wed, 27 Dec 2023 17:09:47 +0100 Subject: [PATCH 12/30] Create graph in roulette estimator only if training --- .../bijections/finite/residual/log_abs_det_estimators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/normalizing_flows/bijections/finite/residual/log_abs_det_estimators.py b/normalizing_flows/bijections/finite/residual/log_abs_det_estimators.py index 813c79e..018ddb1 100644 --- 
a/normalizing_flows/bijections/finite/residual/log_abs_det_estimators.py +++ b/normalizing_flows/bijections/finite/residual/log_abs_det_estimators.py @@ -69,7 +69,7 @@ def roulette_log_abs_det_estimator(g: callable, # P(N >= k) = 1 - P(N < k) = 1 - P(N <= k - 1) = 1 - cdf(k - 1) p_k = 1 - dist.cdf(torch.tensor(k - 1, dtype=torch.long)) neumann_vjp = neumann_vjp + (-1) ** k / (k * p_k) * w - g_value, vjp_jac = torch.autograd.functional.vjp(g, x, neumann_vjp, create_graph=True) + g_value, vjp_jac = torch.autograd.functional.vjp(g, x, neumann_vjp, create_graph=training) # vjp_jac = torch.autograd.grad(g_value, x, neumann_vjp, create_graph=training)[0] log_abs_det_jac_f = torch.sum(vjp_jac * noise, dim=-1) return g_value, log_abs_det_jac_f From 43168a579b83633167f5cb49f2f3a54e7a410136 Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Wed, 27 Dec 2023 17:17:11 +0100 Subject: [PATCH 13/30] Set default hidden layer size of spectral neural network to max(log(n_dim), 4) --- normalizing_flows/bijections/finite/residual/iterative.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/normalizing_flows/bijections/finite/residual/iterative.py b/normalizing_flows/bijections/finite/residual/iterative.py index c778350..36009c2 100644 --- a/normalizing_flows/bijections/finite/residual/iterative.py +++ b/normalizing_flows/bijections/finite/residual/iterative.py @@ -1,3 +1,4 @@ +import math from typing import Union, Tuple import torch @@ -52,7 +53,10 @@ def forward(self, x): class SpectralNeuralNetwork(nn.Sequential): - def __init__(self, n_dim: int, n_hidden: int = 100, n_hidden_layers: int = 2, **kwargs): + def __init__(self, n_dim: int, n_hidden: int = None, n_hidden_layers: int = 2, **kwargs): + if n_hidden is None: + n_hidden = int(max(math.log(n_dim), 4)) + layers = [] if n_hidden_layers == 0: layers = [SpectralLinear(n_dim, n_dim, **kwargs)] From a50432a3d5f6cd76d398f9151a6ebd4bf30ab1c2 Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Wed, 27 Dec 2023 17:21:54 +0100 Subject: [PATCH 14/30] Change default hidden layer size in proximal neural network --- normalizing_flows/bijections/finite/residual/proximal.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/normalizing_flows/bijections/finite/residual/proximal.py b/normalizing_flows/bijections/finite/residual/proximal.py index a138bf7..e7e3e06 100644 --- a/normalizing_flows/bijections/finite/residual/proximal.py +++ b/normalizing_flows/bijections/finite/residual/proximal.py @@ -1,3 +1,4 @@ +import math from typing import Union, Tuple, Optional import torch @@ -104,9 +105,11 @@ class PNN(nn.Sequential): Proximal neural network """ - def __init__(self, event_size: int, n_layers: int = 1, hidden_size: int = 100, act: ProximityOperator = None): + def __init__(self, event_size: int, n_layers: int = 1, hidden_size: int = None, act: ProximityOperator = None): if act is None: act = TanH() + if hidden_size is None: + hidden_size = max(math.log(event_size), 4) super().__init__(*[PNNBlock(event_size, hidden_size, act) for _ in range(n_layers)]) self.n_layers = n_layers self.act = act From 8ee1e7c95a57d0d3813ea5bf50391c8a5a7bdb52 Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Wed, 27 Dec 2023 17:39:30 +0100 Subject: [PATCH 15/30] Set default hidden size in OTFlow --- normalizing_flows/bijections/continuous/otflow.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/normalizing_flows/bijections/continuous/otflow.py b/normalizing_flows/bijections/continuous/otflow.py index e56916e..31a5f7d 
100644 --- a/normalizing_flows/bijections/continuous/otflow.py +++ b/normalizing_flows/bijections/continuous/otflow.py @@ -1,3 +1,4 @@ +import math from typing import Union, Tuple import torch @@ -138,9 +139,13 @@ def hessian_trace(self, class OTPotential(TimeDerivative): - def __init__(self, event_size: int, hidden_size: int, **kwargs): + def __init__(self, event_size: int, hidden_size: int = None, **kwargs): super().__init__() + # hidden_size = m + if hidden_size is None: + hidden_size = max(math.log(event_size), 4) + r = min(10, event_size) # Initialize w to 1 @@ -187,8 +192,8 @@ def hessian_trace(self, s: torch.Tensor, u0: torch.Tensor = None, z1: torch.Tens class OTFlowODEFunction(ExactODEFunction): - def __init__(self, n_dim): - super().__init__(OTPotential(n_dim, hidden_size=30)) + def __init__(self, n_dim, **kwargs): + super().__init__(OTPotential(n_dim, **kwargs)) def compute_log_det(self, t, x): return self.diffeq.hessian_trace(concatenate_x_t(x, t)).view(-1, 1) # Need an empty dim at the end From e82e79a1a22cf4787e0650346055d2f049df19c2 Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Wed, 27 Dec 2023 19:07:14 +0100 Subject: [PATCH 16/30] Fix OTFlow forward output sign --- .../bijections/continuous/otflow.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/normalizing_flows/bijections/continuous/otflow.py b/normalizing_flows/bijections/continuous/otflow.py index 31a5f7d..5021504 100644 --- a/normalizing_flows/bijections/continuous/otflow.py +++ b/normalizing_flows/bijections/continuous/otflow.py @@ -30,19 +30,21 @@ def __init__(self, event_size: int, hidden_size: int, step_size: float = 0.01): divisor = max(event_size ** 2, 10) - K0_delta = torch.randn(size=(hidden_size, event_size)) / divisor - b0_delta = torch.randn(size=(hidden_size,)) / divisor + self.K0_delta = nn.Parameter(torch.randn(size=(hidden_size, event_size)) / divisor) + self.b0 = nn.Parameter(torch.randn(size=(hidden_size,)) / divisor) - K1_delta = torch.randn(size=(hidden_size, hidden_size)) / divisor - b1_delta = torch.randn(size=(hidden_size,)) / divisor + self.K1_delta = nn.Parameter(torch.randn(size=(hidden_size, hidden_size)) / divisor) + self.b1 = nn.Parameter(torch.randn(size=(hidden_size,)) / divisor) - self.K0 = nn.Parameter(torch.eye(hidden_size, event_size) + K0_delta) - self.b0 = nn.Parameter(0 + b0_delta) + self.step_size = step_size - self.K1 = nn.Parameter(torch.eye(hidden_size, hidden_size) + K1_delta) - self.b1 = nn.Parameter(0 + b1_delta) + @property + def K0(self): + return torch.eye(*self.K0_delta.shape) + self.K0_delta / 1000 - self.step_size = step_size + @property + def K1(self): + return torch.eye(*self.K1_delta.shape) + self.K1_delta / 1000 @staticmethod def sigma(x): @@ -115,7 +117,7 @@ def hessian_trace(self, t0 = torch.sum( torch.multiply( - (self.sigma_prime_prime(torch.nn.functional.linear(s, self.K0, self.b0)) * z1), + self.sigma_prime_prime(torch.nn.functional.linear(s, self.K0, self.b0)) * z1, torch.nn.functional.linear(ones, self.K0[:, :-1] ** 2) ), dim=1 @@ -164,7 +166,7 @@ def __init__(self, event_size: int, hidden_size: int = None, **kwargs): self.resnet = OTResNet(event_size + 1, hidden_size, **kwargs) # (x, t) has d+1 elements def forward(self, t, x): - return -self.gradient(concatenate_x_t(x, t)) + return self.gradient(concatenate_x_t(x, t)) def gradient(self, s): # Equation 12 From 962472566012489587adf2e25fb9dffbe89ca908 Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Wed, 27 Dec 2023 19:09:21 +0100 Subject: [PATCH 17/30] 
Handle training=True for residual bijections in Flow.fit --- normalizing_flows/flows.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/normalizing_flows/flows.py b/normalizing_flows/flows.py index 43f949b..4bf670b 100644 --- a/normalizing_flows/flows.py +++ b/normalizing_flows/flows.py @@ -152,6 +152,8 @@ def fit(self, :param early_stopping: if True and validation data is provided, stop the training procedure early once validation loss stops improving for a specified number of consecutive epochs. :param early_stopping_threshold: if early_stopping is True, fitting stops after no improvement in validation loss for this many epochs. """ + self.bijection.train() + # Compute the number of event dimensions n_event_dims = int(torch.prod(torch.as_tensor(self.bijection.event_shape))) @@ -249,6 +251,8 @@ def compute_batch_loss(batch_, reduction: callable = torch.mean): if x_val is not None and keep_best_weights: self.load_state_dict(best_weights) + self.bijection.eval() + def variational_fit(self, target, n_epochs: int = 10, From 549e5c083fca7bf07e7deef505532f65418bce62 Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Wed, 27 Dec 2023 19:24:41 +0100 Subject: [PATCH 18/30] Fix divergence sign for FFJORD, RNODE --- normalizing_flows/bijections/continuous/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/normalizing_flows/bijections/continuous/base.py b/normalizing_flows/bijections/continuous/base.py index 6751ee5..24b5677 100644 --- a/normalizing_flows/bijections/continuous/base.py +++ b/normalizing_flows/bijections/continuous/base.py @@ -209,7 +209,7 @@ def forward(self, t, states): s_.requires_grad_(True) dy = self.diffeq(t, y, *states[2:]) divergence = self.divergence_step(dy, y) - return tuple([dy, -divergence] + [torch.zeros_like(s_).requires_grad_(True) for s_ in states[2:]]) + return tuple([dy, divergence] + [torch.zeros_like(s_).requires_grad_(True) for s_ in states[2:]]) class RegularizedApproximateODEFunction(ApproximateODEFunction): From c8be5c1f25af6ecc717e9d14f945bc5ecc353be5 Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Wed, 27 Dec 2023 22:15:49 +0100 Subject: [PATCH 19/30] Fix gamma in Proximal ResFlow with single layer blocks --- .../bijections/finite/residual/proximal.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/normalizing_flows/bijections/finite/residual/proximal.py b/normalizing_flows/bijections/finite/residual/proximal.py index e7e3e06..c6c23ef 100644 --- a/normalizing_flows/bijections/finite/residual/proximal.py +++ b/normalizing_flows/bijections/finite/residual/proximal.py @@ -120,10 +120,10 @@ def t(self): class ProximalResFlowBlockIncrement(nn.Module): - def __init__(self, pnn: PNN, gamma: float): + def __init__(self, pnn: PNN, gamma: float, max_gamma: float): super().__init__() self.gamma = gamma - self.max_gamma = (pnn.n_layers + 1) / (pnn.n_layers - 1 + 1e-6) + self.max_gamma = max_gamma assert 0 < gamma < self.max_gamma, f'{gamma = }, {self.max_gamma = }' self.phi = pnn @@ -159,14 +159,20 @@ def __init__(self, # Set gamma assert n_layers > 0 - self.max_gamma = (n_layers + 1) / (n_layers - 1 + 1e-6) + + if n_layers > 1: + self.max_gamma = (n_layers + 1) / (n_layers - 1) + else: + self.max_gamma = 1.5 + if gamma is None: gamma = self.max_gamma - 1e-2 assert 0 < gamma < self.max_gamma self.g = ProximalResFlowBlockIncrement( pnn=PNN(event_size=self.n_dim, n_layers=n_layers, **kwargs), - gamma=gamma + gamma=gamma, + max_gamma=self.max_gamma ) def log_det(self, x, **kwargs): From 
b736d6f4dd9f2c64dbe8f2e8b9e53eea219bbe7f Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Wed, 27 Dec 2023 22:16:25 +0100 Subject: [PATCH 20/30] Change defaults for invertible ResNet and ResFlow --- .../bijections/finite/residual/iterative.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/normalizing_flows/bijections/finite/residual/iterative.py b/normalizing_flows/bijections/finite/residual/iterative.py index 36009c2..00f964e 100644 --- a/normalizing_flows/bijections/finite/residual/iterative.py +++ b/normalizing_flows/bijections/finite/residual/iterative.py @@ -11,7 +11,7 @@ class SpectralLinear(nn.Module): # https://arxiv.org/pdf/1811.00995.pdf - def __init__(self, n_inputs: int, n_outputs: int, c: float = 0.97, n_iterations: int = 25): + def __init__(self, n_inputs: int, n_outputs: int, c: float = 0.7, n_iterations: int = 5): super().__init__() self.c = c self.n_inputs = n_inputs @@ -53,9 +53,9 @@ def forward(self, x): class SpectralNeuralNetwork(nn.Sequential): - def __init__(self, n_dim: int, n_hidden: int = None, n_hidden_layers: int = 2, **kwargs): + def __init__(self, n_dim: int, n_hidden: int = None, n_hidden_layers: int = 1, **kwargs): if n_hidden is None: - n_hidden = int(max(math.log(n_dim), 4)) + n_hidden = int(3 * max(math.log(n_dim), 4)) layers = [] if n_hidden_layers == 0: @@ -76,13 +76,14 @@ def __init__(self, event_shape: Union[torch.Size, Tuple[int, ...]], context_shap self.g = SpectralNeuralNetwork(n_dim=self.n_dim, **kwargs) def log_det(self, x: torch.Tensor, **kwargs): - return log_det_power_series(self.g, x, **kwargs)[1] + return log_det_power_series(self.g, x, n_iterations=2, **kwargs)[1] class ResFlowBlock(InvertibleResNetBlock): def __init__(self, event_shape: Union[torch.Size, Tuple[int, ...]], context_shape=None, p: float = 0.5, **kwargs): # TODO add context - super().__init__(event_shape) + self.p = p + super().__init__(event_shape, **kwargs) def log_det(self, x: torch.Tensor, **kwargs): - return log_det_roulette(self.g, x, **kwargs)[1] + return log_det_roulette(self.g, x, p=self.p, **kwargs)[1] From abb32dce10cc79f2d405bd81911074aed7a3278f Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Wed, 27 Dec 2023 22:17:04 +0100 Subject: [PATCH 21/30] Fix log determinant in residual bijection, interleave layers with elementwise affine maps --- .../bijections/finite/residual/base.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/normalizing_flows/bijections/finite/residual/base.py b/normalizing_flows/bijections/finite/residual/base.py index 766ebf5..8f7b8f8 100644 --- a/normalizing_flows/bijections/finite/residual/base.py +++ b/normalizing_flows/bijections/finite/residual/base.py @@ -2,6 +2,7 @@ import torch +from normalizing_flows.bijections.finite.autoregressive.layers import ElementwiseAffine from normalizing_flows.bijections.base import Bijection, BijectiveComposition from normalizing_flows.utils import get_batch_shape, unflatten_event, flatten_event @@ -32,7 +33,7 @@ def forward(self, else: x_flat = flatten_event(x, self.event_shape).clone() x_flat.requires_grad_(True) - log_det = self.log_det(x_flat, training=self.training) + log_det = -self.log_det(x_flat, training=self.training) return z, log_det @@ -59,8 +60,15 @@ def inverse(self, class ResidualComposition(BijectiveComposition): def __init__(self, blocks: List[ResidualBijection]): assert len(blocks) > 0 + event_shape = blocks[0].event_shape + + updated_layers = [ElementwiseAffine(event_shape)] + for i in range(len(blocks)): + 
updated_layers.append(blocks[i]) + updated_layers.append(ElementwiseAffine(event_shape)) + super().__init__( - event_shape=blocks[0].event_shape, - layers=blocks, - context_shape=blocks[0].context_shape + event_shape=updated_layers[0].event_shape, + layers=updated_layers, + context_shape=updated_layers[0].context_shape ) From 2560814ed228cbd75778c8dee356e44346e8f34a Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Wed, 27 Dec 2023 22:19:28 +0100 Subject: [PATCH 22/30] Use different residual blocks in Invertible ResNet and ResFlow --- .../bijections/finite/residual/architectures.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/normalizing_flows/bijections/finite/residual/architectures.py b/normalizing_flows/bijections/finite/residual/architectures.py index 2351987..21594cf 100644 --- a/normalizing_flows/bijections/finite/residual/architectures.py +++ b/normalizing_flows/bijections/finite/residual/architectures.py @@ -14,15 +14,19 @@ class InvertibleResNet(ResidualComposition): def __init__(self, event_shape, context_shape=None, n_layers: int = 16, **kwargs): - block = InvertibleResNetBlock(event_shape=event_shape, context_shape=context_shape, **kwargs) - blocks = [block for _ in range(n_layers)] # The same block + blocks = [ + InvertibleResNetBlock(event_shape=event_shape, context_shape=context_shape, **kwargs) + for _ in range(n_layers) + ] super().__init__(blocks) class ResFlow(ResidualComposition): def __init__(self, event_shape, context_shape=None, n_layers: int = 16, **kwargs): - block = ResFlowBlock(event_shape=event_shape, context_shape=context_shape, **kwargs) - blocks = [block for _ in range(n_layers)] # The same block + blocks = [ + ResFlowBlock(event_shape=event_shape, context_shape=context_shape, **kwargs) + for _ in range(n_layers) + ] super().__init__(blocks) From 5aaf37aa76725e281d003fa3f5127c22f08e5160 Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Wed, 27 Dec 2023 22:30:03 +0100 Subject: [PATCH 23/30] Change ProximalResFlow defaults --- .../bijections/finite/residual/architectures.py | 6 ++++-- .../bijections/finite/residual/proximal.py | 14 +++++++------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/normalizing_flows/bijections/finite/residual/architectures.py b/normalizing_flows/bijections/finite/residual/architectures.py index 21594cf..2c5304a 100644 --- a/normalizing_flows/bijections/finite/residual/architectures.py +++ b/normalizing_flows/bijections/finite/residual/architectures.py @@ -32,8 +32,10 @@ def __init__(self, event_shape, context_shape=None, n_layers: int = 16, **kwargs class ProximalResFlow(ResidualComposition): def __init__(self, event_shape, context_shape=None, n_layers: int = 16, **kwargs): - block = ProximalResFlowBlock(event_shape=event_shape, context_shape=context_shape, **kwargs) - blocks = [block for _ in range(n_layers)] # The same block + blocks = [ + ProximalResFlowBlock(event_shape=event_shape, context_shape=context_shape, gamma=0.01, **kwargs) + for _ in range(n_layers) + ] super().__init__(blocks) diff --git a/normalizing_flows/bijections/finite/residual/proximal.py b/normalizing_flows/bijections/finite/residual/proximal.py index c6c23ef..65530ec 100644 --- a/normalizing_flows/bijections/finite/residual/proximal.py +++ b/normalizing_flows/bijections/finite/residual/proximal.py @@ -69,15 +69,15 @@ def __init__(self, event_size: int, hidden_size: int, act: ProximityOperator): # Initialize b close to 0 # Initialize t_tilde close to identity - identity = torch.eye(self.hidden_size, 
self.event_size) divisor = max(self.event_size ** 2, 100) - delta_b = torch.randn(self.hidden_size) / divisor - delta_t_tilde = torch.randn(self.hidden_size, self.event_size) / divisor - - self.b = nn.Parameter(0 + delta_b) - self.t_tilde = nn.Parameter(identity + delta_t_tilde) + self.b = nn.Parameter(torch.randn(self.hidden_size) / divisor) + self.delta_t_tilde = nn.Parameter(torch.randn(self.hidden_size, self.event_size) / divisor) self.act = act + @property + def t_tilde(self): + return torch.eye(self.hidden_size, self.event_size) + self.delta_t_tilde + @property def stiefel_matrix(self, n_iterations: int = 4): # output has shape (hidden_size, event_size) @@ -185,7 +185,7 @@ def inverse(self, z: torch.Tensor, context: torch.Tensor = None, skip_log_det: bool = False, - n_iterations: int = 500, + n_iterations: int = 25, **kwargs) -> Tuple[torch.Tensor, torch.Tensor]: gamma = self.g.gamma t = self.g.phi.t From fd4dad88fa3288f8b1729122ff3acf6fe64f2ef2 Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Wed, 27 Dec 2023 23:11:48 +0100 Subject: [PATCH 24/30] Fix Gaussian quadrature call for UMNN-MAF --- .../bijections/finite/autoregressive/util.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/normalizing_flows/bijections/finite/autoregressive/util.py b/normalizing_flows/bijections/finite/autoregressive/util.py index fbbc725..42b7108 100644 --- a/normalizing_flows/bijections/finite/autoregressive/util.py +++ b/normalizing_flows/bijections/finite/autoregressive/util.py @@ -1,5 +1,5 @@ from functools import lru_cache -from typing import Tuple, List +from typing import List, Optional, Tuple import numpy as np import torch.autograd @@ -10,13 +10,13 @@ class GaussLegendre(torch.autograd.Function): @staticmethod - def forward(ctx, f, a: torch.Tensor, b: torch.Tensor, n: int, h: List[torch.Tensor]) -> torch.Tensor: + def forward(ctx, f, a: torch.Tensor, b: torch.Tensor, n: int, *h: List[torch.Tensor]) -> torch.Tensor: ctx.f, ctx.n = f, n ctx.save_for_backward(a, b, *h) return GaussLegendre.quadrature(f, a, b, n, h) @staticmethod - def backward(ctx, grad_area: torch.Tensor) -> Tuple[torch.Tensor, ...]: + def backward(ctx, grad_area: torch.Tensor) -> Tuple[Optional[torch.Tensor], ...]: f, n = ctx.f, ctx.n a, b, *h = ctx.saved_tensors @@ -62,4 +62,4 @@ def nodes(n: int, **kwargs) -> Tuple[torch.Tensor, torch.Tensor]: def gauss_legendre(f, a, b, n, h): - return GaussLegendre.apply(f, a, b, n, h) + return GaussLegendre.apply(f, a, b, n, *h) From 3e708b13ec965f49e20cf5c98734c21dc1303600 Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Wed, 27 Dec 2023 23:23:43 +0100 Subject: [PATCH 25/30] Better skip message for residual bijection reconstruction test --- test/test_reconstruction_bijections.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_reconstruction_bijections.py b/test/test_reconstruction_bijections.py index 4aaa45e..934dbdb 100644 --- a/test/test_reconstruction_bijections.py +++ b/test/test_reconstruction_bijections.py @@ -166,7 +166,7 @@ def test_masked_autoregressive(bijection_class: Bijection, batch_shape: Tuple, e assert_valid_reconstruction(bijection, x, context) -@pytest.mark.skip(reason="Computation takes too long") +@pytest.mark.skip(reason="Computation takes too long / inherently inaccurate") @pytest.mark.parametrize('bijection_class', [ ProximalResFlowBlock, InvertibleResNetBlock, From 302665479e32ebda473abc8d637717ac957c606a Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Wed, 27 Dec 2023 23:39:43 +0100 Subject: [PATCH 26/30] 
Change UMNN MAF parametrization

---
 .../finite/autoregressive/architectures.py | 2 +-
 .../bijections/finite/autoregressive/layers.py | 4 ++--
 .../unconstrained_monotonic_neural_network.py | 17 ++++++++++-------
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/normalizing_flows/bijections/finite/autoregressive/architectures.py b/normalizing_flows/bijections/finite/autoregressive/architectures.py
index cca7e34..ca31f0a 100644
--- a/normalizing_flows/bijections/finite/autoregressive/architectures.py
+++ b/normalizing_flows/bijections/finite/autoregressive/architectures.py
@@ -180,7 +180,7 @@ def __init__(self, event_shape, n_layers: int = 2, **kwargs):
 
 
 class UMNNMAF(BijectiveComposition):
-    def __init__(self, event_shape, n_layers: int = 2, **kwargs):
+    def __init__(self, event_shape, n_layers: int = 1, **kwargs):
         if isinstance(event_shape, int):
             event_shape = (event_shape,)
         bijections = [ElementwiseAffine(event_shape=event_shape)]
diff --git a/normalizing_flows/bijections/finite/autoregressive/layers.py b/normalizing_flows/bijections/finite/autoregressive/layers.py
index 15b74ff..f5ba2c7 100644
--- a/normalizing_flows/bijections/finite/autoregressive/layers.py
+++ b/normalizing_flows/bijections/finite/autoregressive/layers.py
@@ -223,8 +223,8 @@ class UMNNMaskedAutoregressive(MaskedAutoregressiveBijection):
     def __init__(self,
                  event_shape: torch.Size,
                  context_shape: torch.Size = None,
-                 n_hidden_layers: int = 1,
-                 hidden_dim: int = 5,
+                 n_hidden_layers: int = None,
+                 hidden_dim: int = None,
                  **kwargs):
         transformer: ScalarTransformer = UnconstrainedMonotonicNeuralNetwork(
             event_shape=event_shape,
diff --git a/normalizing_flows/bijections/finite/autoregressive/transformers/integration/unconstrained_monotonic_neural_network.py b/normalizing_flows/bijections/finite/autoregressive/transformers/integration/unconstrained_monotonic_neural_network.py
index 3036dc7..2290662 100644
--- a/normalizing_flows/bijections/finite/autoregressive/transformers/integration/unconstrained_monotonic_neural_network.py
+++ b/normalizing_flows/bijections/finite/autoregressive/transformers/integration/unconstrained_monotonic_neural_network.py
@@ -32,14 +32,19 @@ class UnconstrainedMonotonicNeuralNetwork(UnconstrainedMonotonicTransformer):
     """
 
     def __init__(self,
                  event_shape: Union[torch.Size, Tuple[int, ...]],
-                 n_hidden_layers: int = 2,
+                 n_hidden_layers: int = None,
                  hidden_dim: int = None):
         super().__init__(event_shape, g=self.neural_network_forward, c=torch.tensor(-100.0))
+
+        if n_hidden_layers is None:
+            n_hidden_layers = 1
         self.n_hidden_layers = n_hidden_layers
+
         if hidden_dim is None:
-            hidden_dim = max(5 * int(math.log(self.n_dim)), 4)
+            hidden_dim = max(int(math.log(self.n_dim)), 4)
         self.hidden_dim = hidden_dim
-        self.const = 1000  # for stability
+
+        self.const = 1  # for stability
         # weight, bias have self.hidden_dim elements
         self.n_input_params = 2 * self.hidden_dim
@@ -118,10 +123,8 @@ def neural_network_forward(inputs, parameters: List[torch.Tensor]):
 
     def base_forward_1d(self, x: torch.Tensor, params: List[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]:
         x_r = x.view(-1, 1, 1)
-        integral_flat = self.integral(x_r, params)
-        log_det_flat = self.g(x_r, params).log()  # We can apply log since g is always positive
-        output = integral_flat.view_as(x)
-        log_det = log_det_flat.view_as(x)
+        output = self.integral(x_r, params).view_as(x)
+        log_det = self.g(x_r, params).log().view_as(x)  # We can apply log since g is always positive
         return output, log_det
 
     def inverse_1d(self, z: torch.Tensor, h: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: From 8dcea8bdb8b641db63758c74547f216819e0f820 Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Mon, 8 Jan 2024 16:50:07 +0100 Subject: [PATCH 27/30] Add variational fit --- normalizing_flows/flows.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/normalizing_flows/flows.py b/normalizing_flows/flows.py index ac5f5d5..451cf39 100644 --- a/normalizing_flows/flows.py +++ b/normalizing_flows/flows.py @@ -248,26 +248,22 @@ def compute_batch_loss(batch_, reduction: callable = torch.mean): self.load_state_dict(best_weights) def variational_fit(self, - target, - n_epochs: int = 10, - lr: float = 0.01, + target_log_prob: callable, + n_epochs: int = 500, + lr: float = 0.05, n_samples: int = 1000, show_progress: bool = False): - # target must have a .sample method that takes as input the batch shape + iterator = tqdm(range(n_epochs), desc='Variational NF fit', disable=not show_progress) optimizer = torch.optim.AdamW(self.parameters(), lr=lr) - if show_progress: - iterator = tqdm(range(n_epochs), desc='Variational NF fit') - else: - iterator = range(n_epochs) - for i in iterator: - x_train = target.sample((n_samples,)).to(self.loc.device) # TODO context! + + for _ in iterator: optimizer.zero_grad() - loss = -self.log_prob(x_train).mean() + loss = -torch.mean(target_log_prob(self.sample(n_samples))) + if hasattr(self.bijection, 'regularization'): + loss += self.bijection.regularization() loss.backward() optimizer.step() - - if show_progress: - iterator.set_postfix_str(f'loss: {float(loss):.4f}') + iterator.set_postfix_str(f'Variational loss: {loss:.4f}') class DDNF(Flow): From b17da392cb7ef7498939c270f079279b5a4ca834 Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Mon, 8 Jan 2024 17:36:02 +0100 Subject: [PATCH 28/30] Fix SVI, add docs --- examples/Variational inference.md | 46 +++++++++++++++++++++++++++++++ normalizing_flows/flows.py | 20 +++++++++++++- 2 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 examples/Variational inference.md diff --git a/examples/Variational inference.md b/examples/Variational inference.md new file mode 100644 index 0000000..8d02ee9 --- /dev/null +++ b/examples/Variational inference.md @@ -0,0 +1,46 @@ +We show how to fit normalizing flows using stochastic variational inference (SVI). Whereas traditional maximum +likelihood estimation requires a fixed dataset of samples, SVI lets us optimize NF parameters with the unnormalized +target log density function. + +As an example, we define the unnormalized log density of a diagonal Gaussian. We assume this target has 10 dimensions +with mean 5 and variance 9 in each dimension: + +```python +import torch + +torch.manual_seed(0) + +event_shape = (10,) +true_mean = torch.full(size=event_shape, fill_value=5.0) +true_variance = torch.full(size=event_shape, fill_value=9.0) + + +def target_log_prob(x: torch.Tensor): + return torch.sum(-((x - true_mean) ** 2 / (2 * true_variance)), dim=1) +``` + +We define the flow and run the variational fit: + +```python +from normalizing_flows import Flow +from normalizing_flows.bijections import RealNVP + +torch.manual_seed(0) +flow = Flow(RealNVP(event_shape=event_shape)) +flow.variational_fit(target_log_prob, show_progress=True) +``` + +We plot samples from the trained flow. We also print estimated marginal means and variances. We see that the estimates are roughly accurate. 
+```python +import matplotlib.pyplot as plt + +torch.manual_seed(0) +x_flow = flow.sample(10000).detach() + +plt.figure() +plt.scatter(x_flow[:, 0], x_flow[:, 1]) +plt.show() + +print(f'{torch.mean(x_flow, dim=0) = }') +print(f'{torch.var(x_flow, dim=0) = }') +``` \ No newline at end of file diff --git a/normalizing_flows/flows.py b/normalizing_flows/flows.py index 451cf39..55b3760 100644 --- a/normalizing_flows/flows.py +++ b/normalizing_flows/flows.py @@ -253,12 +253,30 @@ def variational_fit(self, lr: float = 0.05, n_samples: int = 1000, show_progress: bool = False): + """ + Train a normalizing flow with stochastic variational inference. + Stochastic variational inference lets us train a normalizing flow using the unnormalized target log density + instead of a fixed dataset. + + Refer to Rezende, Mohamed: "Variational Inference with Normalizing Flows" (2015) for more details + (https://arxiv.org/abs/1505.05770, loss definition in Equation 15, training pseudocode for conditional flows in + Algorithm 1). + + :param callable target_log_prob: function that computes the unnormalized target log density for a batch of + points. Receives input batch with shape = (*batch_shape, *event_shape) and outputs batch with + shape = (*batch_shape). + :param int n_epochs: number of training epochs. + :param float lr: learning rate for the AdamW optimizer. + :param float n_samples: number of samples to estimate the variational loss in each training step. + :param bool show_progress: if True, show a progress bar during training. + """ iterator = tqdm(range(n_epochs), desc='Variational NF fit', disable=not show_progress) optimizer = torch.optim.AdamW(self.parameters(), lr=lr) for _ in iterator: optimizer.zero_grad() - loss = -torch.mean(target_log_prob(self.sample(n_samples))) + flow_x, flow_log_prob = self.sample(n_samples, return_log_prob=True) + loss = -torch.mean(target_log_prob(flow_x) + flow_log_prob) if hasattr(self.bijection, 'regularization'): loss += self.bijection.regularization() loss.backward() From 9b87f8fe6ed111e0fe6eb91685270dd34d26eee0 Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Mon, 8 Jan 2024 17:38:12 +0100 Subject: [PATCH 29/30] Skip UMNN tests --- test/test_umnn.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/test_umnn.py b/test/test_umnn.py index c6fe481..182da27 100644 --- a/test/test_umnn.py +++ b/test/test_umnn.py @@ -7,6 +7,7 @@ UnconstrainedMonotonicNeuralNetwork +@pytest.mark.skip(reason="Not finalized") @pytest.mark.parametrize('batch_shape', [(1,), (2,), (5,), (2, 4), (100,), (5, 1, 6, 7), (3, 13, 8)]) @pytest.mark.parametrize('event_shape', [(2,), (3,), (2, 4), (25,)]) def test_umnn(batch_shape: Tuple, event_shape: Tuple): @@ -27,6 +28,7 @@ def test_umnn(batch_shape: Tuple, event_shape: Tuple): f"{torch.max(torch.abs(log_det_forward+log_det_inverse)) = }" +@pytest.mark.skip(reason="Not finalized") def test_umnn_forward(): torch.manual_seed(0) event_shape = (1,) @@ -46,6 +48,7 @@ def test_umnn_forward(): assert torch.allclose(torch.as_tensor([log_det_forward0, log_det_forward1]), log_det_forward) +@pytest.mark.skip(reason="Not finalized") def test_umnn_inverse(): torch.manual_seed(0) event_shape = (1,) @@ -65,6 +68,7 @@ def test_umnn_inverse(): assert torch.allclose(torch.as_tensor([log_det_inverse0, log_det_inverse1]), log_det_inverse) +@pytest.mark.skip(reason="Not finalized") def test_umnn_reconstruction(): torch.manual_seed(0) event_shape = (1,) @@ -96,6 +100,7 @@ def test_umnn_reconstruction(): assert torch.allclose(log_det_forward, 
-log_det_inverse, atol=1e-4) +@pytest.mark.skip(reason="Not finalized") def test_umnn_forward_large_event(): torch.manual_seed(0) event_shape = (2,) @@ -117,6 +122,7 @@ def test_umnn_forward_large_event(): assert torch.allclose(torch.as_tensor([log_det_forward0, log_det_forward1, log_det_forward2]), log_det_forward) +@pytest.mark.skip(reason="Not finalized") def test_umnn_inverse_large_event(): torch.manual_seed(0) event_shape = (2,) @@ -138,6 +144,7 @@ def test_umnn_inverse_large_event(): assert torch.allclose(torch.as_tensor([log_det_inverse0, log_det_inverse1, log_det_inverse2]), log_det_inverse) +@pytest.mark.skip(reason="Not finalized") def test_umnn_reconstruction_large_event(): torch.manual_seed(0) event_shape = (2,) From f555263044b7ce7ed25df508ed40597f366ed573 Mon Sep 17 00:00:00 2001 From: David Nabergoj Date: Mon, 8 Jan 2024 17:40:53 +0100 Subject: [PATCH 30/30] Skip UMNN transformer tests --- test/test_reconstruction_transformers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_reconstruction_transformers.py b/test/test_reconstruction_transformers.py index 352cd69..62c6107 100644 --- a/test/test_reconstruction_transformers.py +++ b/test/test_reconstruction_transformers.py @@ -84,6 +84,7 @@ def test_spline(transformer_class: ScalarTransformer, batch_shape: Tuple, event_ assert_valid_reconstruction(transformer, x, h) +@pytest.mark.skip(reason="Not finalized") @pytest.mark.parametrize('transformer_class', [ UnconstrainedMonotonicNeuralNetwork ])
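
The UMNN reconstruction tests skipped in the last two patches all assert the same round-trip invariant: applying `forward` and then `inverse` must recover the original input, and the forward and inverse log-determinants must cancel (`log_det_forward ≈ -log_det_inverse`). The sketch below illustrates that invariant with a toy element-wise affine map; `ToyAffineBijection` and its `forward`/`inverse` signatures are illustrative stand-ins for this note only, not classes from the library.

```python
from typing import Tuple

import torch


class ToyAffineBijection:
    """Illustrative stand-in for a bijection: z = x * exp(log_scale) + shift (element-wise)."""

    def __init__(self, event_shape: Tuple[int, ...]):
        self.log_scale = 0.1 * torch.randn(event_shape)
        self.shift = 0.1 * torch.randn(event_shape)

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        z = x * torch.exp(self.log_scale) + self.shift
        n_batch_dims = x.dim() - self.log_scale.dim()
        # log|det J| per event: sum of log-scales over the event dimensions.
        log_det = self.log_scale.sum() * torch.ones(x.shape[:n_batch_dims])
        return z, log_det

    def inverse(self, z: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        x = (z - self.shift) * torch.exp(-self.log_scale)
        n_batch_dims = z.dim() - self.log_scale.dim()
        log_det = -self.log_scale.sum() * torch.ones(z.shape[:n_batch_dims])
        return x, log_det


torch.manual_seed(0)
batch_shape, event_shape = (5,), (2,)
bijection = ToyAffineBijection(event_shape)
x = torch.randn(*batch_shape, *event_shape)

z, log_det_forward = bijection.forward(x)
x_reconstructed, log_det_inverse = bijection.inverse(z)

# Round-trip invariants analogous to those asserted in the skipped UMNN tests.
assert torch.allclose(x, x_reconstructed, atol=1e-4)
assert torch.allclose(log_det_forward, -log_det_inverse, atol=1e-4)
```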