diff --git a/bias_transfer/__init__.py b/__init__.py similarity index 100% rename from bias_transfer/__init__.py rename to __init__.py diff --git a/bias_transfer/analysis/plot_shahd.py b/bias_transfer/analysis/plot_shahd.py deleted file mode 100644 index 7a35f9d..0000000 --- a/bias_transfer/analysis/plot_shahd.py +++ /dev/null @@ -1,484 +0,0 @@ -import math -import string - -from .plot import plot -import json -import pickle as pkl -import pandas as pd -import seaborn as sns -import numpy as np -import matplotlib.pyplot as plt -import statsmodels.api as stats_model -import statsmodels.formula.api as smf -import matplotlib.transforms - - -def plot_robustness( - models, - folder_name, - test_set, - levels, - std=False, - name_map={}, - noises=[], - noise_grouping={}, - plot_overview=False, - plot_correlation=False, - plot_correlation_bootstrapped=False, - plot_individual=True, - plot_per_noise_robustness=False, - add_overview_to_groups=False, - **kwargs, -): - - if plot_individual: - means = {} - stds = {} - for model in models: - with open( - f"./{folder_name}/{model}_all_seeds_{test_set}_bootstrapped_stds.json", - "r", - ) as fp: - data = json.load(fp) - stds[model] = data - with open( - f"./{folder_name}/{model}_all_seeds_{test_set}_bootstrapped_means.json", - "r", - ) as fp: - data = json.load(fp) - means[model] = data - for k in stds: - stds[k] = stds[k]["model"] - means[k] = means[k]["model"] - else: - stds = None - means = None - models = models[1:] - - if plot_per_noise_robustness: - with open( - f"./{folder_name}/robust_scores_per_noise.json", - "r", - ) as fp: - per_noise_data_ = json.load(fp) - per_noise_data_ = { - name_map[k]: { - k2: {"mean": v2["mean"] * 100, "std": v2["std"] * 100} - for k2, v2 in v.items() - } - for k, v in per_noise_data_.items() - } - per_noise_data = {} - if noise_grouping: - for model in models: - model = name_map[model] - results = per_noise_data_[model] - per_noise_data[model] = {} - for group_name, group_items in noise_grouping.items(): - per_noise_data[model][name_change(group_name)] = { - "mean": 0, - "std": 0, - } - for noise in group_items: - per_noise_data[model][name_change(group_name)][ - "mean" - ] += results[noise]["mean"] - per_noise_data[model][name_change(group_name)]["std"] += ( - results[noise]["std"] ** 2 - ) - per_noise_data[model][name_change(group_name)]["mean"] /= len( - group_items - ) - per_noise_data[model][name_change(group_name)]["std"] = math.sqrt( - per_noise_data[model][name_change(group_name)]["std"] - / len(group_items) - ) - - else: - for model in models: - model = name_map[model] - results = per_noise_data_[model] - per_noise_data[model] = { - name_change(k): v - for k, v in sorted( - results.items(), key=lambda item: noises.index(item[0]) - ) - } - else: - per_noise_data = {} - - if plot_correlation_bootstrapped: - with open(f"./{folder_name}/mtl_bootstrapped" + ".pkl", "rb") as f: - clean_and_neural = pkl.load(f) - with open( - f"./{folder_name}/robust_scores_scatter_bootstrapped.json", "r" - ) as fp: - robust_scores = json.load(fp) - for br, scores in robust_scores.items(): - scores.update(clean_and_neural[int(br)]) - robust_scores[br] = scores - print("ROBUST", robust_scores) - corrupt_list = [] - corrupt_err_list = [] - neurals = [] - neurals_err_list = [] - imgcls = [] - for key in robust_scores.keys(): - corrupt_list.append(robust_scores[key]["mean_score"]) - corrupt_err_list.append(robust_scores[key]["score_standard_err"]) - neurals.append(robust_scores[key]["mean_neural"]) - 
neurals_err_list.append(robust_scores[key]["std_neural"]) - imgcls.append(robust_scores[key]["mean_cls"]) - corrupt_list = np.array(corrupt_list) * 100 - corrupt_err_list = np.array(corrupt_err_list) * 100 - - robustness_data = pd.DataFrame( - { - "Neural": neurals, - "Robustness": corrupt_list, - "robustness err": corrupt_err_list, - "neural err": neurals_err_list, - "Clean": imgcls, - "category": ["MTL" for _ in neurals], - } - ) - elif plot_correlation: - with open(f"./{folder_name}/mtl" + ".pkl", "rb") as f: - clean_and_neural = pkl.load(f) - with open(f"./{folder_name}/robust_scores_scatter_seeds.json", "r") as fp: - robust_scores = json.load(fp) - for br, scores in robust_scores.items(): - scores.update(clean_and_neural[int(br)]) - robust_scores[br] = scores - print("ROBUST", robust_scores) - corrupt_list = [] - neurals = [] - imgcls = [] - for key in robust_scores.keys(): - corrupt_list += robust_scores[key]["scores"] - neurals += robust_scores[key]["neural"] - imgcls += robust_scores[key]["mean_cls"] - corrupt_list = np.array(corrupt_list) * 100 - - robustness_data = pd.DataFrame( - { - "Neural": neurals, - "Robustness": corrupt_list, - "Clean": imgcls, - "category": ["MTL" for _ in neurals], - } - ) - else: - robustness_data = None - - if plot_overview or add_overview_to_groups: - overview_data = { - "tin_baseline": {"mean": 1.0, "std": 0.0}, - "tin_mtl": { - "mean": 1.141074196151724, - "std": 0.04030340549401311, - }, - "tin_mtl_shuffled": { - "mean": 0.9632811867144085, - "std": 0.027942464073949062, - }, - "tin_mtl_simulated": { - "mean": 1.2174372498105903, - "std": 0.044304298544084894, - }, - "tin_oracle": { - "mean": 1.2374721551434498, - "std": 0.037550730118767965, - }, - } - overview_data = { - name_map[k]: {k2: v2 * 100 for k2, v2 in v.items()} - for k, v in overview_data.items() - } - if add_overview_to_groups and per_noise_data: - for model, results in overview_data.items(): - per_noise_data[model]["Mean"] = results - overview_data = {} - else: - overview_data = {} - - _plot( - noises=noises, - means=means, - stds=stds, - levels_list=levels, - name_map=name_map, - robustness_data=robustness_data, - robustness_overview=overview_data, - robustness_per_noise=per_noise_data, - **kwargs, - ) - - -def name_change(old_name, prefix=""): - name = old_name.replace("->", " → ") - name = name.replace("_", " ") - name = " ".join([n.capitalize() for n in name.split()]) - return prefix + name - - -@plot -def _plot( - fig, - ax, - noises, - means, - stds, - levels_list, - name_map, - robustness_data, - robustness_overview, - robustness_per_noise, -): - colors = { - "Baseline": "#000000", - "MTL-Oracle": "#2578B3", - "Oracle": "#A6CEE3", - "MTL-Shuffled": "#FB9A99", - "MTL-Monkey": "#E31E1B", - } - - row, col = 0, 0 - if means is not None: - for i, cat in enumerate(noises): - levels = pd.DataFrame(columns=["model", "category", "level", "mean", "std"]) - for model in means.keys(): - means_ordered = { - float(level): v for level, v in means[model][cat].items() - } - stds_ordered = { - float(level): v for level, v in stds[model][cat].items() - } - levels = levels.append( - pd.DataFrame( - { - "model": name_map[model], - "category": cat, - "level": list(means_ordered.keys()), - "mean": list(means_ordered.values()), - "std": list(stds_ordered.values()), - } - ) - ) - d = levels.groupby("category").get_group(cat) - d_mean = d.pivot(index="level", columns="model", values=["mean", "std"]) - plot = d_mean["mean"].plot( - ax=ax[row][col], legend=False, yerr=d_mean["std"], color=colors - ) - 
ax[row][col].set_title(name_change(cat)) - plot.set_xticks(levels_list) - plot.set_xticklabels(levels_list) - if row == len(ax)-1: - plot.set_xlabel("Corruption Severity") - else: - plot.set_xlabel(None) - if col == 0: - plot.set_ylabel("Accuracy [%]") - - # ax[row][col].set_ylim([0, 50]) - ax[row][col].grid(True, linestyle=":") - - col = (col + 1) % len(ax[row]) - if col == 0: - row += 1 - - ax[-1][-1].axis("off") - - handles, labels = ax[0][0].get_legend_handles_labels() - new_labels = ["Baseline", "MTL-Monkey", "MTL-Shuffled", "MTL-Oracle", "Oracle"] - new_handles = [] - for label in new_labels: - new_handles.append(handles[labels.index(label)]) - fig.legend(new_handles, new_labels, loc=(0.05, 0.96), ncol=6, frameon=False) - fig.tight_layout() - - if robustness_per_noise: - plot_per_category( - ax, - col, - colors, - fig, - robustness_per_noise, - row, - despine=True, - ) - ax[row][col].set_ylabel("Robustness Score [%]") - box = ax[row][col].get_position() - box.x0 = box.x0 - 0.035 - box.x1 = box.x1 - 0.035 - ax[row][col].set_position(box) - col = (col + 1) % len(ax[row]) - if col == 0: - row += 1 - - if robustness_overview: - if robustness_per_noise: - overview_data = { - m: {"Total": result} for m, result in robustness_overview.items() - } - del overview_data["Baseline"] - plot_per_category(ax, col, colors, fig, overview_data, row, despine=False) - ax[row][col].set_ylabel("") - ax[row][col].axes.get_yaxis().set_visible(False) - box = ax[row][col].get_position() - box.x0 = box.x0 - 0.065 - box.x1 = box.x1 - 0.065 - ax[row][col].set_position(box) - col = (col + 1) % len(ax[row]) - if col == 0: - row += 1 - else: - ax[row][col].axhline(y=100, color=colors["Baseline"], label="Baseline") - plot = sns.barplot( - x=list(robustness_overview.keys())[1:], - y=[y["mean"] for y in robustness_overview.values()][1:], - yerr=[y["std"] for y in robustness_overview.values()][1:], - ax=ax[row][col], - palette=colors, - ) - plot.set_xlabel("") - plot.set_ylabel("Robustness Score [%]") - - ax[row][col].grid(True, linestyle=":") - # ax[row][col].set_ylim([50, 140]) - fig.tight_layout() - sns.despine(offset=3, trim=False) - # plt.setp(ax[row][col].xaxis.get_majorticklabels(), rotation=30, ha="right") - plt.setp(ax[row][col].xaxis.get_majorticklabels(), rotation=-40, ha="left") - # Create offset transform by 5 points in x direction - dx = -1 / 72.0 - dy = 0 / 72.0 - offset = matplotlib.transforms.ScaledTranslation( - dx, dy, fig.dpi_scale_trans - ) - # apply offset transform to all x ticklabels. 
- for label in ax[row][col].xaxis.get_majorticklabels(): - label.set_transform(label.get_transform() + offset) - col = (col + 1) % len(ax[row]) - if col == 0: - row += 1 - - if robustness_data is not None: - if "robustness err" in robustness_data.columns: - markers, caps, bars = ax[row][col].errorbar( - robustness_data["Neural"], - robustness_data["Robustness"], - yerr=robustness_data["robustness err"], - xerr=robustness_data["neural err"], - linestyle="None", - zorder=-32, - ) - # loop through bars and caps and set the alpha value - [bar.set_alpha(0.5) for bar in bars] - [cap.set_alpha(0.5) for cap in caps] - - m, b = np.polyfit(robustness_data["Neural"], robustness_data["Robustness"], 1) - ax[row][col].plot(robustness_data["Neural"], m * robustness_data["Neural"] + b, color="grey") - # Get significance values for correlation: - mod = smf.ols(formula="Robustness ~ Clean * Neural", data=robustness_data) - res = mod.fit() - print("Robustness ~ Clean * Neural") - print(res.summary()) - - mod = smf.ols(formula="Robustness ~ Clean + Neural", data=robustness_data) - res = mod.fit() - print("Robustness ~ Clean + Neural") - print(res.summary()) - for i in range(3): - print(res.pvalues[i]) - - plot = sns.scatterplot( - data=robustness_data, - x="Neural", - y="Robustness", - # sizes=(300, 900), - ax=ax[row][col], - hue="Clean", - palette="rocket_r", - legend=False, - ) - - norm = plt.Normalize( - robustness_data["Clean"].min(), - robustness_data["Clean"].max(), - ) - sm = plt.cm.ScalarMappable(cmap="rocket_r", norm=norm) - sm.set_array([]) - cbar = ax[row][col].figure.colorbar(sm) - cbar.set_label("Clean Accuracy [%]", rotation=270, labelpad=10) - - plot.set_xlabel("Neural Prediction [corr]") - plot.set_ylabel("Robustness Score [%]") - - ax[row][col].grid(True, linestyle=":") - - # fig.tight_layout() - - -def plot_per_category(ax, col, colors, fig, robustness_per_noise, row, despine=False): - df = pd.concat( - {k: pd.DataFrame(v).T for k, v in robustness_per_noise.items()}, axis=0 - ) - df.reset_index(inplace=True) - df.columns = ["Model", "Corruption", "Robustness", "std"] - data_up = df.copy() - data_down = df.copy() - data_up["Robustness"] = data_up["Robustness"] + data_up["std"] - data_down["Robustness"] = data_down["Robustness"] - data_down["std"] - df = pd.concat([data_up, data_down]) - ax[row][col].axhline(y=100, color=colors["Baseline"], label="Baseline") - plot = sns.barplot( - x="Corruption", - y="Robustness", - hue="Model", - data=df, - ax=ax[row][col], - # yerr=df["std"], - palette=colors, - ) - # patches = sorted(plot.patches, key=lambda patch: patch.get_x()) - # for i, bar in enumerate(patches[-4:]): - # if i == 0: - # plt.axvline(x=bar.get_x(), color="grey", linestyle=":") - # bar.set_x(bar.get_x()+ bar.get_width()) - plot.set_xlabel("") - ax[row][col].grid(True, linestyle=":") - ax[row][col].set_ylim([50, 150]) - handles, labels = ax[row][col].get_legend_handles_labels() - ax[row][col].get_legend().remove() - new_labels = ["Baseline", "MTL-Monkey", "MTL-Shuffled", "MTL-Oracle", "Oracle"] - new_handles = [] - for label in new_labels: - new_handles.append(handles[labels.index(label)]) - fig.legend(new_handles, new_labels, loc=(0.01, 0.92), ncol=6, frameon=False) - # fig.tight_layout() - if despine: - sns.despine(offset=3, trim=False) - # plt.setp(ax[row][col].xaxis.get_majorticklabels(), rotation=30) - # for label in ax[row][col].get_xticklabels(): - # label.set_horizontalalignment('center') - # ax[row][col].setp(ax[row][col].xaxis.get_majorticklabels(), rotation=-45) - # 
ax[row][col].set_xticklabels(ax[row][col].get_xticks(), rotation=-45) - - # for tick in ax[row][col].get_xticklabels(): - # tick.set_rotation(-45) - # dx = 1 / 72.0 - # dy = 5 / 72.0 - # offset = matplotlib.transforms.ScaledTranslation(dx, dy, fig.dpi_scale_trans) - # for label in ax[row][col].xaxis.get_majorticklabels(): - # label.set_transform(label.get_transform() + offset) - - for tick in ax[row][col].get_xticklabels(): - tick.set_rotation(-45) - # Create offset transform by 5 points in x direction - dx = 2 / 72.0 - dy = 4 / 72.0 - offset = matplotlib.transforms.ScaledTranslation(dx, dy, fig.dpi_scale_trans) - # apply offset transform to all x ticklabels. - for label in ax[row][col].xaxis.get_majorticklabels(): - label.set_transform(label.get_transform() + offset) diff --git a/bias_transfer/analysis/representation/analyzer.py b/bias_transfer/analysis/representation/analyzer.py deleted file mode 100644 index 7513084..0000000 --- a/bias_transfer/analysis/representation/analyzer.py +++ /dev/null @@ -1,127 +0,0 @@ -import os -import numpy as np -import matplotlib as mpl -import matplotlib.pyplot as plt -import seaborn as sns -import pandas as pd -import torch -from torch import nn -from torch.backends import cudnn - -import bias_transfer.trainer.trainer -from bias_transfer.analysis.plot import plot_preparation, save_plot - - -class RepresentationAnalyzer: - def __init__( - self, - experiment, - table, - name: str, - dataset: str = "val", - base_path: str = "/work/analysis/", - ): - self.experiment = experiment - self.dataset = dataset - # data_loaders, self.model, self.trainer = ( - # table & experiment.get_restrictions() - # ).restore_saved_state(,, - # self.num_samples = -1 - # self.sample_loader = torch.utils.data.DataLoader( - # data_loaders[dataset]["img_classification"].dataset, - # sampler=data_loaders[dataset]["img_classification"].sampler, - # batch_size=64, - # shuffle=False, - # num_workers=1, - # pin_memory=False, - # ) - self.device = "cuda" if torch.cuda.is_available() else "cpu" - self.model = self.model.to(self.device) - self._reset_seed() - self.criterion = nn.CrossEntropyLoss() - self.base_path = base_path - self.name = name - - def _reset_seed(self): - torch.manual_seed(42) - np.random.seed(42) - if self.device == "cuda": - cudnn.benchmark = False - cudnn.deterministic = True - torch.cuda.manual_seed(42) - - def _compute_representation(self, main_loop_modules): - ( - acc, - loss, - module_losses, - collected_outputs, - ) = bias_transfer.trainer.main_loop.main_loop( - self.model, - self.criterion, - self.device, - None, - self.sample_loader, - 0, - main_loop_modules, - train_mode=False, - return_outputs=True, - ) - outputs = [o[self.rep_name] for o in collected_outputs] - print("Acc:", acc, "Loss:", loss, flush=True) - return torch.cat(outputs), acc - - def get_file_name(self, method, rep_name): - return os.path.join(self.base_path, "_".join([self.name, rep_name, method])) - - def save_matrix(self, to_save, method, rep_name): - name = self.get_file_name(method, rep_name) + ".npy" - if not os.path.isdir(self.base_path): - os.mkdir(self.base_path) - np.save(os.path.join(self.base_path, name), to_save) - - def load_matrix(self, method, rep_name): - name = self.get_file_name(method, rep_name) + ".npy" - file = os.path.join(self.base_path, name) - if os.path.isfile(file): - print("Found existing {} result that will be loaded now".format(method)) - return np.load(file) - return None - - def plot_matrix( - self, - matrix_df, - title, - fig=None, - axs=None, - save="", - 
min=None, - max=None, - cbar_outside=True, - ): - if not fig or not axs: - fig, axs = plot_preparation(ratio=(4, 4), style="nips") - fig.tight_layout() # Or equivalently, "plt.tight_layout()" - if cbar_outside: - cbar_ax = fig.add_axes([0.90, 0.2, 0.02, 0.4]) # [left, bottom, width, height] - sns.heatmap( - matrix_df, - cmap="YlGnBu", - xticklabels=10, - yticklabels=10, - vmin=min, - vmax=max, - ax=axs, - cbar=True, - cbar_ax=cbar_ax if cbar_outside else None, - ) - sns.despine(offset=10, trim=True) - if cbar_outside: - fig.tight_layout(rect=[0, 0, 0.9, 1]) - else: - fig.tight_layout() - - st = fig.suptitle(title, fontsize=12) - st.set_y(1.05) - if save: - save_plot(fig,save) diff --git a/bias_transfer/analysis/representation/correlation.py b/bias_transfer/analysis/representation/correlation.py deleted file mode 100644 index 328521d..0000000 --- a/bias_transfer/analysis/representation/correlation.py +++ /dev/null @@ -1,88 +0,0 @@ -import copy - -import torch - -from sklearn.cluster import AgglomerativeClustering -import numpy as np -import matplotlib.pyplot as plt -import seaborn as sns -import os - -from bias_transfer.analysis.representation.analyzer import RepresentationAnalyzer - -#TODO!!! - -class CorrelationAnalyzer(RepresentationAnalyzer): - def _plot_corr_matrix( - self, mat, title="", file_name="", n_clusters=10, indices=None, acc=None - ): - fig, ax = self._plot_preparation(1, 1) - if indices is None: - clusters = AgglomerativeClustering(n_clusters=n_clusters).fit(1 - mat) - indices = np.argsort(clusters.labels_) - sns.heatmap( - mat[indices][:, indices], - cmap="YlGnBu", - xticklabels=400, - yticklabels=400, - vmin=0.0, - vmax=1.0, - ) - # sns.heatmap(mat[indices][:, indices], cmap="YlGnBu", xticklabels=400, yticklabels=400) - sns.despine(offset=10, trim=True) - if title: - fig.suptitle(title, fontsize=16) - if acc: - ax.text( - 0.82, 0.93, "Accuracy: {:02.2f}".format(acc), transform=ax.transAxes - ) - if file_name: - fig.savefig( - os.path.join(self.base_path, file_name), - facecolor=fig.get_facecolor(), - edgecolor=fig.get_edgecolor(), - bbox_inches="tight", - ) - plt.close(fig) - return indices - - - def _compute_corr_matrix(self, x, mode, noise_level): - result = self._load_representation("corr", mode, noise_level) - if result is None: - x_flat = x.flatten(1, -1) - # centered = (x_flat - x_flat.mean()) / x_flat.std() - # result = (centered @ centered.transpose(0, 1)) / x_flat.size()[1] - centered = x_flat - x_flat.mean(dim=1).view(-1, 1) - result = (centered @ centered.transpose(0, 1)) / torch.ger( - torch.norm(centered, 2, dim=1), torch.norm(centered, 2, dim=1) - ) # see https://de.mathworks.com/help/images/ref/corr2.html - print(torch.max(result)) - result = result.detach().cpu() - self._save_representation(result, "corr", mode, noise_level) - return result - - - def corr_matrix( - self, mode="clean", noise_level=0.0, clean_rep=None, *args, **kwargs - ): - self.clean_vs_noisy(noise_level=noise_level) - title = "Correlation matrix for rep from {} data ".format(mode) - if mode == "noisy": - corr_matrix = self._compute_corr_matrix( - self.noisy_rep[0], mode, noise_level - ) - title += "(std = {:01.2f})".format(noise_level) - acc = self.noisy_rep[1] - else: - corr_matrix = self._compute_corr_matrix(self.clean_rep[0], mode, 0.0) - acc = self.clean_rep[1] - - clean_rep = self._plot_corr_matrix( - corr_matrix, - title=title + "\n" + "Model: " + self.experiment.comment, - file_name=self._get_name("corr", mode, noise_level) + "_plot", - indices=clean_rep, - acc=acc, - ) - return 
clean_rep diff --git a/bias_transfer/analysis/representation/dim_reduction.py b/bias_transfer/analysis/representation/dim_reduction.py deleted file mode 100644 index a445cf3..0000000 --- a/bias_transfer/analysis/representation/dim_reduction.py +++ /dev/null @@ -1,238 +0,0 @@ -import copy - -import torch - -import bias_transfer.trainer.trainer -from torch import nn -from bias_transfer.trainer.main_loop_modules.noise_augmentation import NoiseAugmentation -import numpy as np -import pandas as pd -from sklearn.decomposition import PCA -from sklearn.manifold import TSNE -import matplotlib.pyplot as plt -import seaborn as sns -import os - - -#TODO!!!! - -def run(self, method): - if method in ("pca", "tsne"): - to_run = self.dim_reduction - else: - to_run = self.corr_matrix - filenames = [] - clean_rep = to_run(noise_level=0.0, method=method, mode="clean") - filenames.append( - os.path.join(self.path, self._get_name(method, "clean", 0.0) + "_plot.png") - ) - for i in range(1, 21): - noise_level = 0.05 * i - to_run( - noise_level=noise_level, - method=method, - mode="noisy", - clean_rep=clean_rep, - ) - filenames.append( - os.path.join( - self.path, - self._get_name(method, "noisy", noise_level) + "_plot.png", - ) - ) - self._generate_gif(filenames, self._get_name(method=method)) - -def clean_vs_noisy(self, noise_level=0.0): - print("==> Computing Representations", flush=True) - self._reset_seed() - if self.clean_rep is None: - # Representations form clean data: - print("Compute representation of clean input", flush=True) - self.clean_rep = self._compute_representation([]) - else: - print("Representation of clean input already in memory") - - # Representations from noisy data: - print("Compute representation of noisy input", flush=True) - self._reset_seed() - experiment = copy.deepcopy(self.experiment) - bias_transfer.trainer.trainer.trainer.noise_std = {noise_level: 1.0} - main_loop_modules = [ - NoiseAugmentation( - config=bias_transfer.trainer.trainer.trainer, - device=self.device, - data_loader=self.sample_loader, - seed=42, - ) - ] - self.noisy_rep = self._compute_representation(main_loop_modules) - -def _cosine_loss(self, rep_1, rep_2): - # Compare - cosine_criterion = nn.CosineEmbeddingLoss() - return cosine_criterion( - rep_1, rep_2, torch.ones(rep_1.shape[:1], device=self.device) - ) - -def _mse_loss(self, rep_1, rep_2): - mse_criterion = nn.MSELoss() - return mse_criterion(rep_1, rep_2) - -def clean_vs_noisy_distance(self, noise_level=0.0): - self.clean_vs_noisy(noise_level) - cosine = self._cosine_loss(self.clean_rep[0], self.noisy_rep[0]) - mse = self._mse_loss(self.clean_rep[0], self.noisy_rep[0]) - print( - "Clean vs. 
Noisy: Cosine loss:", - cosine.item(), - "MSE loss:", - mse.item(), - flush=True, - ) - -def _convert_to_df(self, rep, noise_level=0.0): - torch.manual_seed(42) - np.random.seed(42) - if self.device == "cuda": - torch.cuda.manual_seed(42) - rep = rep.cpu() - targets = torch.cat([t for _, t in self.sample_loader]).cpu() - self.num_labels = max(targets) + 1 - feat_cols = ["dim" + str(i) for i in range(rep.shape[1])] - df = pd.DataFrame(rep, columns=feat_cols) - df["y"] = targets - df["label"] = df["y"].apply(lambda i: str(i)) - df["noise"] = np.ones_like(targets) * noise_level - if self.num_samples > 0: - # For reproducability of the results - np.random.seed(42) - rndperm = np.random.permutation(df.shape[0]) - df = df.loc[rndperm[: self.num_samples], :].copy() - return df, feat_cols - -def _clean_vs_noisy_df(self, noise_level=0.0): - self.clean_vs_noisy(noise_level=noise_level) - if self.clean_df is None: - self.clean_df, self.feat_cols = self._convert_to_df(self.clean_rep[0], 0.0) - self.noisy_df, _ = self._convert_to_df(self.noisy_rep[0], noise_level) - - -def _compute_pca(self, df, mode, noise_level, pca=None): - pca_result = self._load_representation("pca", mode, noise_level) - if pca_result is None: - if not pca: - pca = PCA(n_components=3) - pca.fit(df[self.feat_cols].values) - pca_result = pca.transform(df[self.feat_cols].values) - self._save_representation(pca_result, "pca", mode, noise_level) - print( - "Explained variation per principal component: {}".format( - pca.explained_variance_ratio_ - ), - flush=True, - ) - df["pca-one"] = pca_result[:, 0] - df["pca-two"] = pca_result[:, 1] - df["pca-three"] = pca_result[:, 2] - return pca - -def _compute_tsne(self, df, mode, noise_level): - tsne_result = self._load_representation("tsne", mode, noise_level) - if tsne_result is None: - tsne = TSNE( - n_components=2, verbose=1, perplexity=40, n_iter=250, init="pca" - ) - tsne_result = tsne.fit_transform(df[self.feat_cols].values) - self._save_representation(tsne_result, "tsne", mode, noise_level) - df["tsne-2d-one"] = tsne_result[:, 0] - df["tsne-2d-two"] = tsne_result[:, 1] - -def _plot_dim_reduction( - self, - df, - data_columns, - num_labels=100, - hue="y", - style=None, - title="", - file_name="", - legend=False, - acc=None, -): - fig, ax = self._plot_preparation(1, len(data_columns)) - if not isinstance(ax, list): - ax = [ax] - for i, (x, y) in enumerate(data_columns): - sns.scatterplot( - x=x, - y=y, - hue=hue, - style=style, - palette=sns.color_palette("hls", num_labels), - data=df, - legend=legend, - s=10, - # ec=None, - ax=ax[i], - ) - if acc: - ax[i].text( - 0.85, - 0.90, - "Accuracy: {:02.2f}".format(acc), - transform=ax[i].transAxes, - ) - sns.despine(offset=10, trim=True) - if title: - fig.suptitle(title, fontsize=16) - if file_name: - fig.savefig( - os.path.join(self.path, file_name), - facecolor=fig.get_facecolor(), - edgecolor=fig.get_edgecolor(), - bbox_inches="tight", - ) - plt.close(fig) - - -def dim_reduction( - self, method="tsne", mode="combined", noise_level=0.0, clean_rep=None -): - self._clean_vs_noisy_df(noise_level=noise_level) - if mode == "combined": - combined_df = pd.DataFrame(self.clean_df) - combined_df = combined_df.append(self.noisy_df, ignore_index=True) - df = combined_df - acc = self.noisy_rep[1] - title = "Rep noisy vs clean data " - elif mode == "noisy": - title = "Rep from noisy data (std = {:01.2f})".format(noise_level) - df = self.noisy_df - acc = self.noisy_rep[1] - else: - title = "Rep from clean data " - df = self.clean_df - acc = 
self.clean_rep[1] - - data_columns = [] - print("==> Computing {} representation".format(method)) - if "tsne" in method: - self._compute_tsne(df, mode, noise_level) - data_columns.append(("tsne-2d-one", "tsne-2d-two")) - if "pca" in method: - clean_rep = self._compute_pca(df, mode, noise_level, pca=clean_rep) - data_columns.append(("pca-one", "pca-two")) - - print("==> Plotting {} representation".format(method)) - self._plot_dim_reduction( - df, - data_columns, - num_labels=self.num_labels, - style="noise" if "combined" in mode else None, - hue="y", - title=title + "\n" + "Model: " + self.experiment.comment, - file_name=self._get_name(method, mode, noise_level) + "_plot", - acc=acc, - ) - return clean_rep - diff --git a/bias_transfer/analysis/representation/noise_stability.py b/bias_transfer/analysis/representation/noise_stability.py deleted file mode 100644 index 6dc29df..0000000 --- a/bias_transfer/analysis/representation/noise_stability.py +++ /dev/null @@ -1,410 +0,0 @@ -import os -import copy -import math -import shutil - -import torch -import numpy as np -import seaborn as sns -import pandas as pd -import matplotlib.pyplot as plt -from torch.autograd import Variable - -from bias_transfer.analysis.plot import plot_preparation, save_plot -from bias_transfer.models import IntermediateLayerGetter -from bias_transfer.trainer.main_loop_modules import NoiseAugmentation -from nnfabrik.utility.dj_helpers import make_hash -from .analyzer import RepresentationAnalyzer - -ALL_REPRESENTATIONS = { - "conv1": "layer0.conv1", - # "relu": "layer0.relu", - # layer1 - "layer1.0.conv1": "layer1.0.conv1", - "layer1.0.conv2": "layer1.0.conv2", - "layer1.0.conv3": "layer1.0.conv3", - # "layer1.0.relu": "layer1.0.relu", - "layer1.1.conv1": "layer1.1.conv1", - "layer1.1.conv2": "layer1.1.conv2", - "layer1.1.conv3": "layer1.1.conv3", - # "layer1.1.relu": "layer1.1.relu", - "layer1.2.conv1": "layer1.2.conv1", - "layer1.2.conv2": "layer1.2.conv2", - "layer1.2.conv3": "layer1.2.conv3", - # "layer1.2.relu": "layer1.2.relu", - # layer2 - "layer2.0.conv1": "layer2.0.conv1", - "layer2.0.conv2": "layer2.0.conv2", - "layer2.0.conv3": "layer2.0.conv3", - # "layer2.0.relu": "layer2.0.relu", - "layer2.1.conv1": "layer2.1.conv1", - "layer2.1.conv2": "layer2.1.conv2", - "layer2.1.conv3": "layer2.1.conv3", - # "layer2.1.relu": "layer2.1.relu", - "layer2.2.conv1": "layer2.2.conv1", - "layer2.2.conv2": "layer2.2.conv2", - "layer2.2.conv3": "layer2.2.conv3", - # "layer2.2.relu": "layer2.2.relu", - "layer2.3.conv1": "layer2.3.conv1", - "layer2.3.conv2": "layer2.3.conv2", - "layer2.3.conv3": "layer2.3.conv3", - # "layer2.3.relu": "layer2.3.relu", - # layer3 - "layer3.0.conv1": "layer3.0.conv1", - "layer3.0.conv2": "layer3.0.conv2", - "layer3.0.conv3": "layer3.0.conv3", - # "layer3.0.relu": "layer3.0.relu", - "layer3.1.conv1": "layer3.1.conv1", - "layer3.1.conv2": "layer3.1.conv2", - "layer3.1.conv3": "layer3.1.conv3", - # "layer3.1.relu": "layer3.1.relu", - "layer3.2.conv1": "layer3.2.conv1", - "layer3.2.conv2": "layer3.2.conv2", - "layer3.2.conv3": "layer3.2.conv3", - # "layer3.2.relu": "layer3.2.relu", - "layer3.3.conv1": "layer3.3.conv1", - "layer3.3.conv2": "layer3.3.conv2", - "layer3.3.conv3": "layer3.3.conv3", - # "layer3.3.relu": "layer3.3.relu", - "layer3.4.conv1": "layer3.4.conv1", - "layer3.4.conv2": "layer3.4.conv2", - "layer3.4.conv3": "layer3.4.conv3", - # "layer3.4.relu": "layer3.4.relu", - "layer3.5.conv1": "layer3.5.conv1", - "layer3.5.conv2": "layer3.5.conv2", - "layer3.5.conv3": "layer3.5.conv3", - # 
"layer3.5.relu": "layer3.5.relu", - # layer4 - "layer4.0.conv1": "layer4.0.conv1", - "layer4.0.conv2": "layer4.0.conv2", - "layer4.0.conv3": "layer4.0.conv3", - # "layer4.0.relu": "layer4.0.relu", - "layer4.1.conv1": "layer4.1.conv1", - "layer4.1.conv2": "layer4.1.conv2", - "layer4.1.conv3": "layer4.1.conv3", - # "layer4.1.relu": "layer4.1.relu", - "layer4.2.conv1": "layer4.2.conv1", - "layer4.2.conv2": "layer4.2.conv2", - "layer4.2.conv3": "layer4.2.conv3", - # "layer4.2.relu": "layer4.2.relu", - # core output - "flatten": "core", - "fc": "readout", -} - - -def centering(K): - n = K.shape[0] - unit = torch.ones([n, n], device=K.device) - I = torch.eye(n, device=K.device) - H = I - unit / n - - return torch.mm( - torch.mm(H, K), H - ) # HKH are the same with KH, KH is the first centering, H(KH) do the second time, results are the sme with one time centering - # return np.dot(H, K) # KH - - -def rbf(X, sigma=None): - GX = torch.dot(X, X.T) - KX = torch.diag(GX) - GX + (torch.diag(GX) - GX).T - if sigma is None: - mdist = torch.median(KX[KX != 0]) - sigma = math.sqrt(mdist) - KX *= -0.5 / (sigma * sigma) - KX = torch.exp(KX) - return KX - - -def kernel_HSIC(X, Y, sigma): - return torch.sum(centering(rbf(X, sigma)) * centering(rbf(Y, sigma))) - - -def linear_HSIC(X, Y): - L_X = torch.mm(X, X.T) - L_Y = torch.mm(Y, Y.T) - return torch.sum(centering(L_X) * centering(L_Y)) - - -def linear_CKA(X, Y): - hsic = linear_HSIC(X, Y) - var1 = torch.sqrt(linear_HSIC(X, X)) - var2 = torch.sqrt(linear_HSIC(Y, Y)) - - return hsic / (var1 * var2) - - -def kernel_CKA(X, Y, sigma=None): - hsic = kernel_HSIC(X, Y, sigma) - var1 = torch.sqrt(kernel_HSIC(X, X, sigma)) - var2 = torch.sqrt(kernel_HSIC(Y, Y, sigma)) - - return hsic / (var1 * var2) - - -def pairwise_l2_distances(x, y=None): - """ - see: https://discuss.pytorch.org/t/efficient-distance-matrix-computation/9065 - Input: x is a Nxd matrix - y is an optional Mxd matirx - Output: dist is a NxM matrix where dist[i,j] is the square norm between x[i,:] and y[j,:] - if y is not given then use 'y=x'. - i.e. 
dist[i,j] = ||x[i,:]-y[j,:]||^2 - """ - x_norm = (x ** 2).sum(1).view(-1, 1) - if y is not None: - y_norm = (y ** 2).sum(1).view(1, -1) - else: - y = x - y_norm = x_norm.view(1, -1) - - dist = x_norm + y_norm - 2.0 * torch.mm(x, torch.transpose(y, 0, 1)) - return dist - - -def RDM(X, dist_measure="corr"): - X = X - X.mean(dim=-1).unsqueeze(-1) - if dist_measure == "corr": - result = (X @ torch.transpose(X, 0, 1)) / torch.ger( - torch.norm(X, 2, dim=1), torch.norm(X, 2, dim=1) - ) - elif dist_measure == "l2": - result = pairwise_l2_distances(X) - return result - - -def RDM_comparison(X, Y, dist_measure="corr"): - RDM_X = RDM(X, dist_measure).flatten() - RDM_Y = RDM(Y, dist_measure).flatten() - result = RDM_X @ RDM_Y.T - result /= (X.shape[0]) ** 2 - return result - - -def similarity(X, Y, dist_measure="CKA"): - if dist_measure == "CKA": - return linear_CKA(X, Y) - else: - return RDM_comparison(X, Y, dist_measure) - - -class NoiseStabilityAnalyzer(RepresentationAnalyzer): - def __init__( - self, - num_samples=0, - num_repeats=4, - noise_std_max=0.51, - noise_std_step=0.01, - rep_names=None, - dist_measures=("CKA",), - *args, - **kwargs - ): - super().__init__(*args, **kwargs) - if rep_names is None: - rep_names = ALL_REPRESENTATIONS.values() - self.rep_names = rep_names - if not num_samples: - self.num_samples = len(self.sample_loader.sampler) - else: - self.num_samples = num_samples - self.num_repeats = num_repeats - self.dist_measures = dist_measures - self.noise_stds = np.arange(0, noise_std_max, noise_std_step) - if isinstance(self.model, IntermediateLayerGetter): - self.model = self.model._model - self.model = IntermediateLayerGetter(self.model, ALL_REPRESENTATIONS) - self.accuracy = None - self.tmp_path = os.path.join( - self.base_path, "tmp" + make_hash(self.name) - ) - self.num_batches = math.ceil(self.num_samples / self.sample_loader.batch_size) - - def run(self): - noise_stabilities = {d: [] for d in self.dist_measures} - for rep_name in self.rep_names: - shutil.rmtree(self.tmp_path, ignore_errors=True, onerror=None) - os.makedirs(self.tmp_path) - self._reset_seed() - self._compute_representation(rep_name) - torch.cuda.empty_cache() - for dist_measure in self.dist_measures: - stability_matrix = self.compute_stability_matrix(dist_measure, rep_name) - # compute stability measure - noise_stabilities[dist_measure].append( - np.average(stability_matrix[0, :]) - ) - # prepare plotting - stability_df = pd.DataFrame(stability_matrix) - stability_df.astype(float) - stability_df.columns = ["{:01.2f}".format(n) for n in self.noise_stds] - stability_df.index = ["{:01.2f}".format(n) for n in self.noise_stds] - fig, axs = plot_preparation( - nrows=2, - ncols=1, - fraction=0.5, - sharex=True, - ratio=(0.9, 1), # to make it quadratic - gridspec_kw={"height_ratios": [1, 4]}, - style="nips", - ) - self.plot_acc_over_noise(axs[0]) - self.plot_matrix( - matrix_df=stability_df, - title=self.name + ": " + rep_name + "(" + dist_measure + ")", - min=0 if dist_measure == "CKA" else None, - max=1 if dist_measure == "CKA" else None, - save=None, - fig=fig, - axs=axs[1], - cbar_outside=True, - ) - levels = np.arange(0, 1.0, 0.1) - contours = axs[1].contour( - stability_matrix, colors="white", levels=levels - ) - axs[1].clabel(contours, inline=True, fontsize=8) - save_plot( - fig, self.get_file_name(dist_measure, rep_name.replace(".", "_")) - ) - print("Finished {} {}-Analysis".format(rep_name, dist_measure)) - shutil.rmtree(self.tmp_path, ignore_errors=True, onerror=None) - for dist_measure, stability in 
noise_stabilities.items(): - fig = self.plot_noise_stability(stability) - save_plot(fig, self.get_file_name(dist_measure, "stability")) - - def _compute_representation(self, rep_name, *args, **kwargs): - test_input = next(iter(self.sample_loader))[0][:1].to(self.device) - test_out, _ = self.model(test_input) - test_out = test_out[rep_name] - if isinstance(test_out, list): - print(rep_name, len(test_out)) - test_out = test_out[0] - rep_size = test_out.flatten(1, -1).shape[-1] - - correct = torch.zeros((self.num_repeats, len(self.noise_stds))) - for batch_idx, (inputs, targets) in enumerate(self.sample_loader): - inputs, targets = ( - inputs.to(self.device, dtype=torch.float), - targets.to(self.device), - ) - self.batch_size = inputs.shape[0] - reps = torch.zeros( - ( - # self.batch_size, - min( - self.batch_size, self.num_samples - batch_idx * self.batch_size, - ), - self.num_repeats, - len(self.noise_stds), - rep_size, - ) - ) - for repeat in range(self.num_repeats): - for noise_idx, noise_std in enumerate(self.noise_stds): - # apply noise - trainer = copy.deepcopy(self.experiment.trainer) - trainer.noise_std = {noise_std: 1.0} - module = NoiseAugmentation( - self.model, - config=trainer, - device=self.device, - data_loader={"img_classification": self.sample_loader}, - seed=42, - ) - inputs_ = inputs.clone() - model, inputs_ = module.pre_forward( - self.model, inputs_, {}, train_mode=False - ) - # Forward - outputs = model(inputs_) - samples_start = 0 - samples_end = min( - self.batch_size, self.num_samples - batch_idx * self.batch_size - ) - # for rep_name in self.rep_names: - rep = outputs[0][rep_name].flatten(1, -1).detach().cpu() - reps[samples_start:samples_end, repeat, noise_idx] = rep.view( - (self.batch_size, -1) - )[: (samples_end - samples_start)] - - # track accuracy - _, predicted = outputs[1].max(1) - targets_ = targets[: (samples_end - samples_start)] - predicted_ = predicted[: (samples_end - samples_start)] - correct[repeat, noise_idx] += predicted_.eq(targets_).sum().item() - torch.save( - reps, - os.path.join(self.tmp_path, "reps_{}_{}".format(rep_name, batch_idx)), - ) - - if (batch_idx + 1) * self.batch_size >= self.num_samples: - break - - self.accuracy = ((correct / self.num_samples) * 100).numpy() - - def plot_noise_stability(self, stability): - df = pd.DataFrame( - {"Layer": range(1, len(stability) + 1), "Stability": stability} - ) - fig, axs = plot_preparation(nrows=1, ncols=1, fraction=0.5, style="nips",) - g = sns.lineplot(x="Layer", y="Stability", data=df, ax=axs) - xlabels = [str(int(x)) for x in g.get_xticks()] - g.set_xticklabels(xlabels) - return fig - - def plot_acc_over_noise(self, ax): - df = pd.DataFrame(self.accuracy) - df.astype(float) - df.columns = ["{:01.2f}".format(n) for n in self.noise_stds] - df = df.stack().reset_index() - df.columns = ["repeat", "noise", "Accuracy"] - sns.lineplot(x="noise", y="Accuracy", data=df, ax=ax) - ax.set_ylim(ymin=0, ymax=100) - - def compute_stability_matrix(self, dist_measure, rep_name): - result = self.load_matrix(dist_measure, rep_name.replace(".", "_")) - if result is None: - rep_pieces = [ - torch.load( - os.path.join( - self.tmp_path, "reps_{}_{}".format(rep_name, batch_idx) - ) - ) - for batch_idx in range(self.num_batches) - ] - rep = torch.cat(rep_pieces) - result = torch.zeros((len(self.noise_stds), len(self.noise_stds))) - first_loop = ( - range(self.num_repeats - 1) if self.num_repeats > 1 else range(1) - ) - for r in first_loop: - second_loop = ( - range(r + 1, self.num_repeats) if self.num_repeats > 1 
else range(1) - ) - for r2 in second_loop: - reps1 = rep[:, r].to(self.device) - reps2 = rep[:, r2].to(self.device) - for i in range(len(self.noise_stds)): - for j in range(i, len(self.noise_stds)): - res = similarity(reps1[:, i], reps2[:, j], dist_measure) - result[i, j] += res.detach().cpu() - if j != i: - result[j, i] += res.detach().cpu() - res = similarity(reps2[:, i], reps1[:, j], dist_measure) - result[i, j] += res.detach().cpu() - if j != i: - result[j, i] += res.detach().cpu() - del reps1 - del reps2 - result = ( - ( - result / (2 * (self.num_repeats * (self.num_repeats - 1) / 2)) - if self.num_repeats > 1 - else result - ) - .detach() - .numpy() - ) - self.save_matrix(result, dist_measure, rep_name.replace(".", "_")) - return result diff --git a/bias_transfer/analysis/results/bias_transfer_benchmark.py b/bias_transfer/analysis/results/bias_transfer_benchmark.py deleted file mode 100644 index 76d6bf6..0000000 --- a/bias_transfer/analysis/results/bias_transfer_benchmark.py +++ /dev/null @@ -1,171 +0,0 @@ -import pandas as pd -import seaborn as sns -import matplotlib.pyplot as plt - -from .base import Analyzer -from ..plot import plot - - -class BiasTransferAnalyzer(Analyzer): - def generate_table( - self, - objective=("Test", "img_classification", "accuracy"), - last_n=0, - label_steps=False, - ): - row_list = [] - for desc, results in self.data.items(): - if label_steps: - name_split = desc.name.split(" ") - name = " ".join(name_split[:-1]) - labels = name_split[-1][1:-1].split(";") - else: - name, labels = (desc.name, None) - row = {"name": name} - levels = sorted(list(results.keys())) - if last_n: - levels = levels[(-1) * last_n :] - for level, tracker in results.items(): - try: - if level in levels: - l = levels.index(level) - if labels: - l = labels[l] - row[l] = tracker.get_current_objective(*objective) - except: - pass # no valid entry for this objective - row_list.append(row) - df = pd.DataFrame(row_list) - if not df.empty: - df = df.groupby("name").first() - # Split off alpha from name - df = df.reset_index() - new = df["name"].str.split(":", n=1, expand=True) - if len(new.columns) > 1: - df.drop(columns=["name"], inplace=True) - df["name"] = new[0] - df["alpha"] = new[1] - df = df.set_index("name") - return df - - def generate_normalized_table(self): - df = self.generate_table(last_n=2, label_steps=True) - for i, c in enumerate(df.columns): - offset = "A" if i % 2 == 0 else "B" - baseline = df.at[f"Direct Training {offset}", c] - df.insert( - 2 * i + 1, c + " normalized", df[c].divide(baseline).multiply(100) - ) - return df - - @plot - def plot_frontier( - self, fig, ax, columns_range=(), title=False, hide_lines=False, - ): - df = self.generate_table(last_n=2, label_steps=True) - direct_a = ( - df.loc["Direct Training on Target"] - if "Direct Training on Target" in df.index - else None - ) - direct_b = ( - df.loc["Direct Training on Eval"] - if "Direct Training on Eval" in df.index - else None - ) - max_x, min_x, max_y, min_y = 0, 100, 0, 100 - for i, c in enumerate(df.columns): - if not columns_range[0] <= i <= columns_range[1]: - continue - if i % 2 == 1: - if True: - a = ax[i - 1 - columns_range[0]][i - 1 - columns_range[0]] - else: - a = ax[(i - 1) // 4][((i - 1) % 4) // 2] - colors = [ - "#a6cee3", - "#1f78b4", - "#b2df8a", - "#33a02c", - "#fb9a99", - "#e31a1c", - "#fdbf6f", - "#ff7f00", - "#cab2d6", - "#6a3d9a", - "#ffff99", - ] - models = sorted(list(set(df.index))) - print(models) - colors = dict(zip(models, colors[: len(models)])) - print(colors) - plot_res = 
sns.lineplot(
-                    data=df,
-                    x=df.columns[i - 1],
-                    y=c,
-                    hue="name",
-                    ax=a,
-                    legend="brief",
-                    style="name",
-                    markers=True,
-                    palette=colors,
-                )
-                for line in plot_res.lines[2:]:
-                    line.set_visible(not hide_lines)
-                # if i == 5 and legend_outside:
-                # a.legend(
-                # fontsize=14,
-                # title_fontsize="14",
-                # bbox_to_anchor=(1.05, 1),
-                # loc="upper left",
-                # borderaxespad=0.0,
-                # )
-                if direct_b is not None:
-                    a.axhline(
-                        y=direct_b[c], lw=0.7, color=colors["Direct Training on Eval"]
-                    )
-                if direct_a is not None:
-                    a.axvline(
-                        x=direct_a[df.columns[i - 1]],
-                        lw=0.7,
-                        color=colors["Direct Training on Target"],
-                    )
-                min_x = min(min_x, a.get_xlim()[0])
-                min_y = min(min_y, a.get_ylim()[0])
-                max_x = max(max_x, a.get_xlim()[1])
-                max_y = max(max_y, a.get_ylim()[1])
-                if title:
-                    a.set_title(self.name_map(a.get_xlabel()), fontweight="bold")
-                a.set_xlabel(
-                    self.name_map(a.get_xlabel().split("->")[1], "Target Task: ")
-                )
-                a.set_ylabel(self.name_map(a.get_ylabel(), "Evaluation: "))
-
-
-        for i in range(len(ax)):
-            for j in range(len(ax[i])):
-                # axs[i][j].set_xlim([min_x,max_x])
-                ax[i][j].set_ylim([min_y, max_y])
-
-        # sns.despine(offset=5, trim=False)
-        # plt.subplots_adjust(hspace=0.4)
-        # if "talk" in style:
-        # if legend_outside:
-        # pass
-        # # ax.legend(
-        # # fontsize=14,
-        # # title_fontsize="14",
-        # # bbox_to_anchor=(1.05, 1),
-        # # loc="upper left",
-        # # borderaxespad=0.0,
-        # # )
-        # else:
-        # plt.legend(fontsize=14, title_fontsize="14")
-        # elif legend_outside:
-        # plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0.0)
-        # if save:
-        # save_plot(
-        # fig,
-        # save + "_" + style,
-        # types=("png", "pdf", "pgf") if "nips" in style else ("png",),
-        # )
diff --git a/bias_transfer/analysis/results/c_test.py b/bias_transfer/analysis/results/c_test.py
deleted file mode 100644
index cba5a3a..0000000
--- a/bias_transfer/analysis/results/c_test.py
+++ /dev/null
@@ -1,148 +0,0 @@
-import pandas as pd
-import seaborn as sns
-import matplotlib.pyplot as plt
-import re
-
-from .base import Analyzer
-from ..plot import plot_preparation
-
-
-class CTestAnalyzer(Analyzer):
-    corruption_map = {
-        "shot_noise": "Shot Noise",
-        "impulse_noise": "Impulse Noise",
-        "speckle_noise": "Speckle Noise",
-        "gaussian_noise": "Gaussian Noise",
-        "defocus_blur": "Defocus Blur",
-        "gaussian_blur": "Gauss Blur",
-        "motion_blur": "Motion Blur",
-        "glass_blur": "Glass Blur",
-        "zoom_blur": "Zoom Blur",
-        "brightness": "Brightness",
-        "fog": "Fog",
-        "frost": "Frost",
-        "snow": "Snow",
-        "contrast": "Contrast",
-        "elastic_transform": "Elastic Transform",
-        "pixelate": "Pixelate",
-        "jpeg_compression": "JPEG Compression",
-        "saturate": "Saturate",
-        "spatter": "Spatter",
-    }
-
-    Res_Alex_Net_mean = dict()
-    Res_Alex_Net_mean["Gaussian Noise"] = 0.886
-    Res_Alex_Net_mean["Shot Noise"] = 0.894
-    Res_Alex_Net_mean["Impulse Noise"] = 0.923
-    Res_Alex_Net_mean["Defocus Blur"] = 0.820
-    Res_Alex_Net_mean["Gauss Blur"] = 0.826
-    Res_Alex_Net_mean["Glass Blur"] = 0.826
-    Res_Alex_Net_mean["Motion Blur"] = 0.786
-    Res_Alex_Net_mean["Zoom Blur"] = 0.798
-    Res_Alex_Net_mean["Snow"] = 0.867
-    Res_Alex_Net_mean["Frost"] = 0.827
-    Res_Alex_Net_mean["Fog"] = 0.819
-    Res_Alex_Net_mean["Brightness"] = 0.565
-    Res_Alex_Net_mean["Contrast"] = 0.853
-    Res_Alex_Net_mean["Elastic Transform"] = 0.646
-    Res_Alex_Net_mean["Pixelate"] = 0.718
-    Res_Alex_Net_mean["JPEG Compression"] = 0.607
-    Res_Alex_Net_mean["Speckle Noise"] = 0.845
-    Res_Alex_Net_mean["Spatter"] = 0.718
-    Res_Alex_Net_mean["Saturate"] = 0.658
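The AlexNet reference errors above are the normaliser that `calculate_c_scores` (further down in this file) divides by to turn a model's mean corruption error into a relative corruption-error score; a minimal sketch of that step for a single corruption, using made-up per-severity accuracies (`per_severity_acc` is hypothetical), could look like this:

import numpy as np

# hypothetical per-severity accuracies (%) under Gaussian Noise, severities 1-5
per_severity_acc = np.array([55.0, 48.0, 40.0, 31.0, 24.0])

# mean error in percent, mirroring the `100 - x` step in calculate_c_scores
mean_err = (100.0 - per_severity_acc).mean()

# divide by the AlexNet reference error for the same corruption
ce = mean_err / 0.886  # Res_Alex_Net_mean["Gaussian Noise"]

# averaging ce over all corruptions gives the mCE column that calculate_c_scores reports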
- - def extract_c_test_results(self): - corruptions = ( - "shot_noise", - "impulse_noise", - # "speckle_noise", - "gaussian_noise", - "defocus_blur", - # "gaussian_blur", - "motion_blur", - "glass_blur", - "zoom_blur", - "brightness", - "fog", - "frost", - "snow", - "contrast", - "elastic_transform", - "pixelate", - "jpeg_compression", - # "saturate", - # "spatter", - ) - data_to_plot = pd.DataFrame() - for corruption in corruptions: - row_list = [] - for desc, tracker in self.data.items(): - row = { - severity: tracker.get_current_objective( - corruption, str(severity), "accuracy" - ) - for severity in range(1, 6) - } - row[0] = tracker.get_current_objective( - "Test", "img_classification", "accuracy" - ) - row["name"] = desc.name - row_list.append(row) - df = pd.DataFrame(row_list) - df = df.groupby("name").mean() - df["Corruption"] = corruption - data_to_plot = pd.concat([data_to_plot, df], axis=0, sort=True) - return data_to_plot - - def calculate_c_scores(self): - c_data = self.extract_c_test_results() - df = c_data[c_data.columns[0:6]].apply(lambda x: 100 - x) - df_mean = df[df.columns[0:6]].mean(axis=1) - c_data = pd.concat([c_data, df_mean], axis=1) - c_data.columns = [1, 2, 3, 4, 5, 0, "Corruption", "Mean"] - - def normalize_alexnet(row): - mean_error = row["Mean"] - corruption = row["Corruption"] - ce = mean_error / self.Res_Alex_Net_mean[self.corruption_map[corruption]] - return pd.concat([row, pd.Series({"mCE": ce})]) - - c_data = c_data.apply(normalize_alexnet, axis=1) - c_data = c_data.groupby("name").mean() - return c_data - - def plot_grid(self, style, **kwargs): - fig, ax = plot_preparation(style) - data_to_plot = self.extract_c_test_results() - g = sns.FacetGrid( - data=data_to_plot, - col="Corruption", - col_wrap=4, - sharey=True, - sharex=True, - # height=4 - ) - - def draw_heatmap(data, *args, **kwargs): - del data["Corruption"] - # print(data) - sns.heatmap(data, annot=True, cbar=False) - - g.map_dataframe(draw_heatmap) - fig = g.fig - sns.despine(offset=10, trim=True) - # remove ticks again (see: https://stackoverflow.com/questions/37860163/seaborn-despine-brings-back-the-ytick-labels) - # loop over the non-left axes: - for i, ax in enumerate(g.axes.flat): - if i % 4 != 0: - # get the yticklabels from the axis and set visibility to False - for label in ax.get_yticklabels(): - label.set_visible(False) - ax.yaxis.offsetText.set_visible(False) - if i < len(g.axes) - 4: - # get the xticklabels from the axis and set visibility to False - for label in ax.get_xticklabels(): - label.set_visible(False) - ax.xaxis.offsetText.set_visible(False) - self._post_plot_operations(style, **kwargs) - diff --git a/bias_transfer/analysis/results/regression.py b/bias_transfer/analysis/results/regression.py deleted file mode 100644 index 43a5e13..0000000 --- a/bias_transfer/analysis/results/regression.py +++ /dev/null @@ -1,52 +0,0 @@ -import torch - -from bias_transfer.gp.nn_kernel import nn_kernel -import matplotlib.pyplot as plt - - -class Analyzer: - def __init__(self): - self.data_loaders = {} - self.model = None - self.trainer = None - - def load_model(self, config, table, transfer_level): - # Select data: - if transfer_level < len(config.get_restrictions()): - restricted = table & config.get_restrictions()[transfer_level] - else: - print("Nothing to load") - restricted = None - if restricted: # could be empty if entry is not computed yet - self.data_loaders, self.model, self.trainer = restricted.load_model( - include_dataloader=True, include_trainer=True, include_state_dict=True - ) 
- - def plot_eval(self, save=""): - self.model.eval() - x_test, y_test = self.data_loaders["test"]["regression"].dataset.tensors - x_train, y_train = self.data_loaders["train"]["regression"].dataset.tensors - plt.plot(x_test, y_test, color="orange", lw=2, label="True") - plt.plot(x_train, y_train, color="red", label="Traning data") - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - self.model.to(device) - prediction = self.model(x_test.to(device)) # input x and predict based on x - if isinstance(prediction, tuple): - prediction = prediction[1] - plt.plot(x_test, prediction.detach().cpu().numpy(), label="Prediction") - plt.legend() - if save: - fig = plt.gcf() - fig.savefig(save, dpi=200) - - def plot_kernel(self): - self.model.eval() - x_test, y_test = self.data_loaders["test"]["regression"].dataset.tensors - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - self.model.to(device) - K_plot = nn_kernel(x_test, x_test, net=self.model, device=device) - plt.imshow(K_plot) - # if np.count_nonzero(x) > 0: - # _ = plt.xticks(np.arange(0,x.shape[0], 15),x[::15,0].astype(np.int)) - # _ = plt.yticks(np.arange(0,x.shape[0], 15),x[::15,0].astype(np.int)) - plt.colorbar() diff --git a/bias_transfer/analysis/train_path/base.py b/bias_transfer/analysis/train_path/base.py deleted file mode 100644 index a023c8f..0000000 --- a/bias_transfer/analysis/train_path/base.py +++ /dev/null @@ -1,80 +0,0 @@ -import tempfile - -import numpy as np -import pandas as pd -import seaborn as sns -import matplotlib.pyplot as plt -import torch - -from sklearn.decomposition import PCA -from sklearn.manifold import TSNE -from bias_transfer.analysis.plot import save_plot, plot -from bias_transfer.tables.transfer import Checkpoint -from neuralpredictors.tracking import AdvancedMultipleObjectiveTracker as Tracker - - -class Analyzer: - def __init__(self): - self.data = {} - - def load_data(self, configs): - # Select data: - with tempfile.TemporaryDirectory() as temp_dir: - for description, config in configs.items(): - level = 0 - while True: - restriction = config.get_restrictions(level) - if not restriction: - break - restricted = Checkpoint() & restriction - if restricted: # could be empty if entry is not computed yet - fetch_res = restricted.fetch("state", "epoch", as_dict=True, download_path=temp_dir) - if description not in self.data: - self.data[description] = {} - for res in fetch_res: - data = self.data.get(description, {}).get(level, {}) - data[res["epoch"]] = torch.load(res["state"])["net"] - self.data[description][level] = data - level += 1 - - def _compute_pca(self, tensors): - pca = PCA(n_components=2) - pca.fit(tensors) - pca_result = pca.transform(tensors) - print( - "Explained variation per principal component: {}".format( - pca.explained_variance_ratio_ - ), - flush=True, - ) - return pca_result - - def _compute_tsne(self, tensors): - tsne = TSNE( - n_components=2, verbose=1, perplexity=40, n_iter=250, init="pca" - ) - return tsne.fit_transform(tensors) - - - def _flatten_state_dict(self, state): - parameters = [] - for param in state.values(): - parameters.append(torch.flatten(param).cpu().numpy()) - return np.concatenate(parameters) - - @plot - def plot_paths(self, fig, ax, level=0, method="pca"): - parameters = [] - labels = [] - for descr, states in self.data.items(): - states = states[level] - for epoch, state in states.items(): - parameters.append(self._flatten_state_dict(state)) - labels.append(f"{descr.name}, Seed {descr.seed}") - parameters = 
np.stack(parameters) - if method == "tsne": - result = self._compute_tsne(parameters) - else: - result = self._compute_pca(parameters) - sns.scatterplot(x=result[:,0], y=result[:,1], hue=labels, ax=ax) - diff --git a/bias_transfer/configs/dataset/__init__.py b/bias_transfer/configs/dataset/__init__.py deleted file mode 100644 index cf6ff9c..0000000 --- a/bias_transfer/configs/dataset/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from .mnist import MNIST -from .mnist_ib import MNIST_IB -from .imagenet import ImageNet -from .tiny_imagenet import TinyImageNet -from bias_transfer.configs.dataset.mixins.transfer import Generated -from .regression import Regression -from .image import ImageDatasetConfig -from .base import DatasetConfig -from .mtl import MTLDatasetsConfig \ No newline at end of file diff --git a/bias_transfer/configs/dataset/image.py b/bias_transfer/configs/dataset/image.py deleted file mode 100644 index 83228d4..0000000 --- a/bias_transfer/configs/dataset/image.py +++ /dev/null @@ -1,98 +0,0 @@ -from typing import Dict, Tuple - -from bias_transfer.configs.dataset.base import DatasetConfig -from bias_transfer.tables.nnfabrik import Dataset - - -class ImageDatasetConfig(DatasetConfig): - config_name = "dataset" - table = Dataset() - fn = "bias_transfer.dataset.img_dataset_loader" - - data_mean_defaults = { - "CIFAR100": (0.5070751592371323, 0.48654887331495095, 0.4409178433670343,), - "CIFAR10": (0.49139968, 0.48215841, 0.44653091), - "SVHN": (0.4377, 0.4438, 0.4728), - "TinyImageNet_bw": (0.4519,), - "TinyImageNet": (0.4802, 0.4481, 0.3975,), - "ImageNet": (0.485, 0.456, 0.406), - "MNIST": (0.1307,), - "MNIST_color": (0.03685451, 0.0367535, 0.03952756), - "MNIST_color_easy": (0.03685451, 0.0367535, 0.03952756), - "MNIST_noise": (0.13405791,), - "MNIST_rotation": (0.0640235,), - "MNIST_translation": (0.06402363,), - "MNIST_addition": (0.06402363,), - "MNIST_clean": (0.06402363,), - "MNIST_clean_shuffle": (0.06402363,), - "FashionMNIST_color": (0.08239705, 0.09176614, 0.0904255,), - "FashionMNIST_color_shuffle": (0.08239705, 0.09176614, 0.0904255,), - "FashionMNIST_color_easy": (0.08239705, 0.09176614, 0.0904255,), - "FashionMNIST_noise": (0.19938468,), - "FashionMNIST_rotation": (0.14016011,), - "FashionMNIST_rotation_regression": (0.14016011,), - "FashionMNIST_translation": (0.1401599,), - "FashionMNIST_addition": (0.1401599,), - "FashionMNIST_clean": (0.1401599,), - "FashionMNIST_clean_shuffle": (0.1401599,), - } - data_std_defaults = { - "CIFAR100": (0.2673342858792401, 0.2564384629170883, 0.27615047132568404,), - "CIFAR10": (0.24703223, 0.24348513, 0.26158784), - "SVHN": (0.1980, 0.2010, 0.1970), - "TinyImageNet_bw": (0.2221,), - "TinyImageNet": (0.2302, 0.2265, 0.2262,), - "ImageNet": (0.229, 0.224, 0.225), - "MNIST": (0.3081,), - "MNIST_color": (0.17386045, 0.16883257, 0.1768625), - "MNIST_color_easy": (0.17386045, 0.16883257, 0.1768625), - "MNIST_noise": (0.22387815,), - "MNIST_rotation": (0.0640235,), - "MNIST_translation": (0.22534915,), - "MNIST_addition": (0.22534915,), - "MNIST_clean": (0.22534915,), - "MNIST_clean_shuffle": (0.22534915,), - "FashionMNIST_color": (0.25112887, 0.26145387, 0.26009334,), - "FashionMNIST_color_shuffle": (0.25112887, 0.26145387, 0.26009334,), - "FashionMNIST_color_easy": (0.25112887, 0.26145387, 0.26009334,), - "FashionMNIST_noise": (0.28845804,), - "FashionMNIST_rotation": (0.28369352,), - "FashionMNIST_rotation_regression": (0.28369352,), - "FashionMNIST_translation": (0.28550556,), - "FashionMNIST_addition": (0.28550556,), - 
"FashionMNIST_clean": (0.28550556,), - "FashionMNIST_clean_shuffle": (0.28550556,), - } - - def __init__(self, **kwargs): - self.load_kwargs(**kwargs) - - self.dataset_cls: str = "CIFAR10" - self.apply_augmentation: bool = True - self.apply_normalization: bool = True - self.apply_grayscale: bool = False - self.apply_noise: Dict = {} - self.convert_to_rgb: bool = False - self.input_size: int = 32 - self.add_corrupted_test: bool = False - self.add_stylized_test: bool = False - self.use_c_test_as_val: bool = False - self.show_sample: bool = False - self.filter_classes: Tuple = () # (start,end) - self.data_dir: str = "./data/image_classification/torchvision/" - self.num_workers: int = 1 - dataset_id = ( - f"{self.dataset_sub_cls}_{self.bias}" if self.bias else self.dataset_cls - ) - dataset_id += "_bw" if self.apply_grayscale else "" - self.train_data_mean: Tuple[float] = self.data_mean_defaults[dataset_id] - self.train_data_std: Tuple[float] = self.data_std_defaults[dataset_id] - - super().__init__(**kwargs) - - @property - def filters(self): - filters = [] - if self.filter_classes: - filters.append("ClassesFilter") - return filters \ No newline at end of file diff --git a/bias_transfer/configs/dataset/imagenet.py b/bias_transfer/configs/dataset/imagenet.py deleted file mode 100644 index e2aef11..0000000 --- a/bias_transfer/configs/dataset/imagenet.py +++ /dev/null @@ -1,13 +0,0 @@ -from bias_transfer.configs.dataset.image import ImageDatasetConfig - - -class ImageNet(ImageDatasetConfig): - def __init__(self, **kwargs): - self.load_kwargs(**kwargs) - self.dataset_cls: str = "ImageNet" - self.data_dir: str = "./data/image_classification/" - self.input_size: int = 224 - self.num_workers: int = 8 - self.valid_size: float = 0.0416 # To get ~50K (test set size) - - super().__init__(**kwargs) diff --git a/bias_transfer/configs/dataset/mixins/__init__.py b/bias_transfer/configs/dataset/mixins/__init__.py deleted file mode 100644 index 03019f9..0000000 --- a/bias_transfer/configs/dataset/mixins/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .transfer import Generated \ No newline at end of file diff --git a/bias_transfer/configs/dataset/mixins/transfer.py b/bias_transfer/configs/dataset/mixins/transfer.py deleted file mode 100644 index aae3bfe..0000000 --- a/bias_transfer/configs/dataset/mixins/transfer.py +++ /dev/null @@ -1,15 +0,0 @@ -from bias_transfer.configs.base import BaseConfig -from bias_transfer.tables.nnfabrik import Dataset - - -class Generated(BaseConfig): - config_name = "dataset" - table = Dataset() - fn = "bias_transfer.dataset.transferred_dataset_loader" - - def __init__(self, **kwargs): - self.load_kwargs(**kwargs) - self.train_on_reduced_data: bool = False - self.train_on_coreset: bool = False - self.load_coreset: bool = False - super().__init__(**kwargs) diff --git a/bias_transfer/configs/dataset/mnist.py b/bias_transfer/configs/dataset/mnist.py deleted file mode 100644 index e18c404..0000000 --- a/bias_transfer/configs/dataset/mnist.py +++ /dev/null @@ -1,9 +0,0 @@ -from bias_transfer.configs.dataset.image import ImageDatasetConfig - - -class MNIST(ImageDatasetConfig): - def __init__(self, **kwargs): - self.load_kwargs(**kwargs) - self.dataset_cls: str = "MNIST" - self.input_size: int = 28 - super().__init__(**kwargs) diff --git a/bias_transfer/configs/dataset/mnist_ib.py b/bias_transfer/configs/dataset/mnist_ib.py deleted file mode 100644 index c67caca..0000000 --- a/bias_transfer/configs/dataset/mnist_ib.py +++ /dev/null @@ -1,15 +0,0 @@ -from 
bias_transfer.configs.dataset.image import ImageDatasetConfig - - -class MNIST_IB(ImageDatasetConfig): - def __init__(self, **kwargs): - self.load_kwargs(**kwargs) - self.dataset_cls = "MNIST-IB" - self.input_size: int = 40 if self.bias != "addition" else 80 - self.convert_to_rgb: bool = False - self.bias: str = "clean" - self.dataset_sub_cls: str = "FashionMNIST" # could also be MNIST - self.apply_data_normalization: bool = False - self.apply_data_augmentation: bool = False - self.add_corrupted_test: bool = False - super().__init__(**kwargs) diff --git a/bias_transfer/configs/dataset/mtl.py b/bias_transfer/configs/dataset/mtl.py deleted file mode 100644 index 98a3331..0000000 --- a/bias_transfer/configs/dataset/mtl.py +++ /dev/null @@ -1,68 +0,0 @@ -from typing import Dict - -from bias_transfer.configs.dataset.base import DatasetConfig -from bias_transfer.tables.nnfabrik import Dataset - - -class MTLDatasetsConfig(DatasetConfig): - config_name = "dataset" - table = Dataset() - fn = "bias_transfer.dataset.mtl_datasets_loader" - - def __init__(self, sub_configs, **kwargs): - self.load_kwargs(**kwargs) - self.sub_configs = sub_configs - super().__init__(**kwargs) - - # super().__init__(**kwargs) - # self.neural_dataset_dict = kwargs.pop("neural_dataset_dict", {}) - # self.neural_dataset_config = NeuralDatasetConfig( - # **self.neural_dataset_dict - # ).to_dict() - # self.img_dataset_dict = kwargs.pop("img_dataset_dict", {}) - # self.img_dataset_config = ImageDatasetConfig(**self.img_dataset_dict).to_dict() - # - # self.update(**kwargs) - - def items(self): - return self.sub_configs.items() - - def values(self): - return self.sub_configs.values() - - def keys(self): - return self.sub_configs.keys() - - def __getitem__(self, item): - return self.sub_configs[item] - - @classmethod - def from_dict(cls, config_dict: Dict) -> "MTLDatasetsConfig": - """ - Constructs a `Config` from a Python dictionary of parameters. - - Args: - config_dict (:obj:`Dict[str, any]`): - Dictionary that will be used to instantiate the configuration object. Such a dictionary can be retrieved - from a pre-trained checkpoint by leveraging the :func:`~transformers.PretrainedConfig.get_config_dict` - method. - Returns: - :class:`MTLDatasetConfig`: An instance of a configuration object - """ - sub_configs = {} - for name, conf in config_dict.items(): - dataset_cls = next(iter(conf.keys())) - sub_configs[name] = globals()[dataset_cls].from_dict(conf[dataset_cls]) - return cls(sub_configs) - - def to_dict(self): - """ - Serializes this instance to a Python dictionary. 
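The from_dict/to_dict pair of the deleted MTLDatasetsConfig round-trips each sub-config under its class name, i.e. {name: {ClassName: config_dict}}. A stand-alone sketch of that nesting, with a simplified ToyConfig and an explicit registry standing in for the repo's config classes and the globals() lookup used above:

class ToyConfig:
    # simplified stand-in for the repo's dataset config classes
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

    def to_dict(self):
        return dict(self.__dict__)

    @classmethod
    def from_dict(cls, d):
        return cls(**d)

REGISTRY = {"ToyConfig": ToyConfig}  # stands in for the globals() lookup

def mtl_from_dict(config_dict):
    sub_configs = {}
    for name, conf in config_dict.items():
        cls_name = next(iter(conf))  # the class name is the single key
        sub_configs[name] = REGISTRY[cls_name].from_dict(conf[cls_name])
    return sub_configs

def mtl_to_dict(sub_configs):
    return {name: {c.__class__.__name__: c.to_dict()} for name, c in sub_configs.items()}

nested = {"img": {"ToyConfig": {"dataset_cls": "TinyImageNet"}},
          "neural": {"ToyConfig": {"dataset": "CSRF19_V1"}}}
assert mtl_to_dict(mtl_from_dict(nested)) == nested  # round trip is lossless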
- - Returns: - :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, - """ - output = {} - for name, conf in self.sub_configs.items(): - output[name] = {conf.__class__.__name__: conf.to_dict()} - return output \ No newline at end of file diff --git a/bias_transfer/configs/dataset/neural.py b/bias_transfer/configs/dataset/neural.py deleted file mode 100644 index e1e0cf6..0000000 --- a/bias_transfer/configs/dataset/neural.py +++ /dev/null @@ -1,19 +0,0 @@ -from bias_transfer.configs.dataset.base import DatasetConfig -from bias_transfer.tables.nnfabrik import Dataset - - -class NeuralDatasetConfig(DatasetConfig): - config_name = "dataset" - table = Dataset() - fn = "bias_transfer.dataset.neural_dataset_loader" - - def __init__(self, **kwargs): - self.load_kwargs(**kwargs) - self.train_frac = 0.8 - self.dataset = "CSRF19_V1" - self.data_dir = "./data/monkey/toliaslab/{}".format(self.dataset) - self.seed = 1000 - self.subsample = 1 - self.crop = 70 - self.time_bins_sum = 12 - super().__init__(**kwargs) diff --git a/bias_transfer/configs/dataset/regression.py b/bias_transfer/configs/dataset/regression.py deleted file mode 100644 index b94265d..0000000 --- a/bias_transfer/configs/dataset/regression.py +++ /dev/null @@ -1,18 +0,0 @@ -from bias_transfer.configs.dataset.base import DatasetConfig -from bias_transfer.tables.nnfabrik import Dataset - - -class Regression(DatasetConfig): - config_name = "dataset" - table = Dataset() - fn = "bias_transfer.dataset.regression_dataset_loader" - - def __init__(self, **kwargs): - self.load_kwargs(**kwargs) - self.dataset_cls: str = "co2" - self.apply_normalization: bool = False - self.apply_noise: bool = False - self.input_size: int = 32 - self.num_workers: int = 0 - self.train_range: int = 10 - super().__init__(**kwargs) diff --git a/bias_transfer/configs/dataset/tiny_imagenet.py b/bias_transfer/configs/dataset/tiny_imagenet.py deleted file mode 100644 index 061c1e9..0000000 --- a/bias_transfer/configs/dataset/tiny_imagenet.py +++ /dev/null @@ -1,12 +0,0 @@ -from bias_transfer.configs.dataset.image import ImageDatasetConfig - - -class TinyImageNet(ImageDatasetConfig): - def __init__(self, **kwargs): - self.load_kwargs(**kwargs) - self.dataset_cls: str = "TinyImageNet" - self.data_dir: str = "./data/image_classification/" - self.input_size: int = 64 - self.num_workers: int = 2 - self.valid_size: int = 0.1 - super().__init__(**kwargs) diff --git a/bias_transfer/configs/model/__init__.py b/bias_transfer/configs/model/__init__.py deleted file mode 100644 index 9ce5c8f..0000000 --- a/bias_transfer/configs/model/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .regression import Regression -from .classification import Classification -from .imagenet import ImageNet, TinyImageNet -from .cifar import CIFAR10, CIFAR100 -from .mnist import MNIST, MNISTIB -from .svhn import SVHN -from .mtl import MTL diff --git a/bias_transfer/configs/model/cifar.py b/bias_transfer/configs/model/cifar.py deleted file mode 100644 index 1d1a802..0000000 --- a/bias_transfer/configs/model/cifar.py +++ /dev/null @@ -1,19 +0,0 @@ -from bias_transfer.configs.model.classification import Classification - - -class CIFAR100(Classification): - def __init__(self, **kwargs): - self.load_kwargs(**kwargs) - self.input_channels: int = 3 - self.input_size: int = 32 - self.num_classes: int = 100 - super().__init__(**kwargs) - - -class CIFAR10(Classification): - def __init__(self, **kwargs): - self.load_kwargs(**kwargs) - self.input_channels: int = 3 - 
self.input_size: int = 32 - self.num_classes: int = 10 - super().__init__(**kwargs) diff --git a/bias_transfer/configs/model/classification.py b/bias_transfer/configs/model/classification.py deleted file mode 100644 index 6769324..0000000 --- a/bias_transfer/configs/model/classification.py +++ /dev/null @@ -1,32 +0,0 @@ -from typing import Dict, Tuple - -from .base import ModelConfig - - -class Classification(ModelConfig): - fn = "bias_transfer.models.classification_model_builder" - - def __init__(self, **kwargs): - self.load_kwargs(**kwargs) - self.type: str = "resnet50" - self.conv_stem_kernel_size: int = 3 - self.conv_stem_padding: int = 1 - self.conv_stem_stride: int = 1 - self.core_stride: int = 1 - self.max_pool_after_stem: bool = False - self.advanced_init: bool = False - self.zero_init_residual: bool = False - self.adaptive_pooling: bool = False - self.avg_pool: bool = False - - # resnet specific - self.noise_adv_classification: bool = False - self.noise_adv_regression: bool = False - self.num_noise_readout_layers: int = 1 - self.noise_sigmoid_output: bool = self.noise_adv_classification - # vgg specific - self.pretrained: bool = False - self.pretrained_path: str = "" - self.readout_type: str = "dense" - self.add_buffer: Tuple = () - super().__init__(**kwargs) \ No newline at end of file diff --git a/bias_transfer/configs/model/imagenet.py b/bias_transfer/configs/model/imagenet.py deleted file mode 100644 index 80e890b..0000000 --- a/bias_transfer/configs/model/imagenet.py +++ /dev/null @@ -1,29 +0,0 @@ -from bias_transfer.configs.model.classification import Classification - - -class ImageNet(Classification): - def __init__(self, **kwargs): - self.load_kwargs(**kwargs) - self.num_classes: int = 1000 - self.input_size: int = 224 - self.input_channels: int = 3 - self.conv_stem_kernel_size: int = 7 - self.conv_stem_padding: int = 3 - self.conv_stem_stride: int = 2 - self.max_pool_after_stem: bool = True - self.advanced_init: bool = True - self.zero_init_residual: bool = True - self.adaptive_pooling: bool = True - self.avg_pool: bool = True - super().__init__(**kwargs) - - -class TinyImageNet(Classification): - def __init__(self, **kwargs): - self.load_kwargs(**kwargs) - self.num_classes = 200 - self.input_size = 64 - self.input_channels: int = 3 - self.core_stride = 2 - self.conv_stem_kernel_size = 5 - super().__init__(**kwargs) diff --git a/bias_transfer/configs/model/mnist.py b/bias_transfer/configs/model/mnist.py deleted file mode 100644 index cd8304c..0000000 --- a/bias_transfer/configs/model/mnist.py +++ /dev/null @@ -1,24 +0,0 @@ -from bias_transfer.configs.model.classification import Classification - - -class MNIST(Classification): - def __init__(self, **kwargs): - self.load_kwargs(**kwargs) - self.type: str = "lenet5" - self.num_classes: int = 10 - self.input_size: int = 28 - self.input_channels: int = 1 - self.comment = f"MNIST {self.type}" - super().__init__(**kwargs) - - -class MNISTIB(MNIST): - def __init__(self, **kwargs): - self.load_kwargs(**kwargs) - self.bias: str = "clean" - self.type: str = "lenet300-100" if self.bias == "translation" else "lenet5" - self.num_classes: int = 1 if "regression" in self.bias else 10 - self.input_size: int = 80 if self.bias == "addition" else 40 - self.input_channels: int = 3 if "color" in self.bias else 1 - self.comment = f"MNIST-IB {self.bias} {self.type}" - super().__init__(**kwargs) diff --git a/bias_transfer/configs/model/mtl.py b/bias_transfer/configs/model/mtl.py deleted file mode 100644 index d6616bc..0000000 --- 
a/bias_transfer/configs/model/mtl.py +++ /dev/null @@ -1,31 +0,0 @@ -from .base import ModelConfig - - -class MTL(ModelConfig): - fn = "bias_transfer.models.mtl_builder" - - def __init__(self, **kwargs): - self.load_kwargs(**kwargs) - self.vgg_type = kwargs.pop("vgg_type", "vgg19_bn") - self.classification = kwargs.pop("classification", False) - self.classification_readout_type = kwargs.pop( - "classification_readout_type", None - ) - self.input_size = kwargs.pop("input_size", None) - self.num_classes = kwargs.pop("num_classes", 200) - self.pretrained = kwargs.pop("pretrained", True) - - self.v1_model_layer = kwargs.pop("v1_model_layer", 17) - self.neural_input_channels = kwargs.pop("neural_input_channels", 1) - self.v1_fine_tune = kwargs.pop("v1_fine_tune", False) - self.v1_init_mu_range = kwargs.pop("v1_init_mu_range", 0.3) - self.v1_init_sigma_range = kwargs.pop("v1_init_sigma_range", 0.6) - self.v1_readout_bias = kwargs.pop("v1_readout_bias", True) - self.v1_bias = kwargs.pop("v1_bias", True) - self.v1_final_batchnorm = kwargs.pop("v1_final_batchnorm", False) - self.v1_gamma_readout = kwargs.pop("v1_gamma_readout", 0.5) - self.v1_elu_offset = kwargs.pop("v1_elu_offset", -1) - self.classification_input_channels = kwargs.pop( - "classification_input_channels", 1 - ) - super().__init__(**kwargs) diff --git a/bias_transfer/configs/model/neural.py b/bias_transfer/configs/model/neural.py deleted file mode 100644 index f19cf9e..0000000 --- a/bias_transfer/configs/model/neural.py +++ /dev/null @@ -1,20 +0,0 @@ -from .base import ModelConfig - - -class Neural(ModelConfig): - fn = "bias_transfer.models.neural_cnn_builder" - - @baseline - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.readout_type = kwargs.pop("readout_type", "point") - if self.readout_type == "point": - self.hidden_dilation = kwargs.pop("hidden_dilation", 2) - self.se_reduction = kwargs.pop("se_reduction", 16) - self.input_kern = kwargs.pop("input_kern", 24) - self.hidden_kern = kwargs.pop("hidden_kern", 9) - self.depth_separable = kwargs.pop("depth_separable", True) - self.stack = kwargs.pop("stack", -1) - self.n_se_blocks = kwargs.pop("n_se_blocks", 2) - self.gamma_readout = kwargs.pop("gamma_readout", 0.5) - self.gamma_input = kwargs.pop("gamma_input", 10) \ No newline at end of file diff --git a/bias_transfer/configs/model/regression.py b/bias_transfer/configs/model/regression.py deleted file mode 100644 index 77ca298..0000000 --- a/bias_transfer/configs/model/regression.py +++ /dev/null @@ -1,15 +0,0 @@ -from .base import ModelConfig - - -class Regression(ModelConfig): - fn = "bias_transfer.models.regression_model_builder" - - def __init__(self, **kwargs): - self.load_kwargs(**kwargs) - self.type: str = "fc" - self.input_size: int = 1 - self.output_size: int = 1 - self.layer_size: int = 100 - self.num_layers: int = 4 - self.activation: str = "sigmoid" - super().__init__(**kwargs) diff --git a/bias_transfer/configs/model/svhn.py b/bias_transfer/configs/model/svhn.py deleted file mode 100644 index 3aa3df2..0000000 --- a/bias_transfer/configs/model/svhn.py +++ /dev/null @@ -1,10 +0,0 @@ -from bias_transfer.configs.model.classification import Classification - - -class SVHN(Classification): - def __init__(self, **kwargs): - self.load_kwargs(**kwargs) - self.input_size: int = 32 - self.num_classes: int = 10 - self.input_channels: int = 3 - super().__init__(**kwargs) diff --git a/bias_transfer/configs/trainer/__init__.py b/bias_transfer/configs/trainer/__init__.py deleted file mode 100644 index 41ca10c..0000000 
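The deleted MTL and Neural model configs above rely on the kwargs.pop(option, default) idiom: every option gets an inline default and whatever remains is forwarded to the parent config. A small sketch of that pattern with hypothetical stand-in classes (not the repo's ModelConfig base):

class ToyBaseConfig:
    def __init__(self, **kwargs):
        self.extra = kwargs  # whatever was not popped by subclasses lands here

class ToyMTLConfig(ToyBaseConfig):
    def __init__(self, **kwargs):
        self.vgg_type = kwargs.pop("vgg_type", "vgg19_bn")
        self.num_classes = kwargs.pop("num_classes", 200)
        self.v1_model_layer = kwargs.pop("v1_model_layer", 17)
        super().__init__(**kwargs)

cfg = ToyMTLConfig(num_classes=100, comment="mtl run")
print(cfg.vgg_type, cfg.num_classes, cfg.extra)  # vgg19_bn 100 {'comment': 'mtl run'}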
--- a/bias_transfer/configs/trainer/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .regression import Regression -from .classification import Classification -from .base import TrainerConfig -from . import mixins \ No newline at end of file diff --git a/bias_transfer/configs/trainer/classification.py b/bias_transfer/configs/trainer/classification.py deleted file mode 100644 index 28da989..0000000 --- a/bias_transfer/configs/trainer/classification.py +++ /dev/null @@ -1,15 +0,0 @@ -from bias_transfer.configs.trainer.base import TrainerConfig - - -class Classification(TrainerConfig): - fn = "bias_transfer.trainer.img_classification" - - def __init__(self, **kwargs): - self.load_kwargs(**kwargs) - - self.maximize: bool = True # if stop_function maximized or minimized - self.eval_with_bn_train: bool = False - - super(Classification, self).__init__(**kwargs) - - diff --git a/bias_transfer/configs/trainer/mixins/__init__.py b/bias_transfer/configs/trainer/mixins/__init__.py deleted file mode 100644 index d916310..0000000 --- a/bias_transfer/configs/trainer/mixins/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from .lottery_ticket import LotteryTicketMixin -from .transfer import DataGenerationMixin, TransferMixin -from .noise import ( - NoiseAdversarialMixin, - RepresentationMatchingMixin, - RepresentationMonitorMixin, - NoiseAugmentationMixin, -) diff --git a/bias_transfer/configs/trainer/mixins/lottery_ticket.py b/bias_transfer/configs/trainer/mixins/lottery_ticket.py deleted file mode 100644 index c7b01f5..0000000 --- a/bias_transfer/configs/trainer/mixins/lottery_ticket.py +++ /dev/null @@ -1,17 +0,0 @@ -from typing import Dict - -from bias_transfer.configs.base import BaseConfig - - -class LotteryTicketMixin(BaseConfig): - def __init__(self, **kwargs): - self.load_kwargs(**kwargs) - - self.lottery_ticket: Dict = {} - if self.lottery_ticket: - self.max_iter = self.lottery_ticket.get( - "rounds", 1 - ) * self.lottery_ticket.get("round_length", 100) - self.main_loop_modules.append("LotteryTicketPruning") - - super().__init__(**kwargs) diff --git a/bias_transfer/configs/trainer/mixins/transfer.py b/bias_transfer/configs/trainer/mixins/transfer.py deleted file mode 100644 index 5c5d17c..0000000 --- a/bias_transfer/configs/trainer/mixins/transfer.py +++ /dev/null @@ -1,69 +0,0 @@ -from typing import Dict, Tuple - -from bias_transfer.configs.base import BaseConfig - - -class DataGenerationMixin(BaseConfig): - def __init__(self, **kwargs): - self.load_kwargs(**kwargs) - - self.data_transfer: bool = False - self.save_input: bool = False - self.save_representation: bool = False - self.compute_fisher: Dict = { - "DEFAULT EMPTY": True, # will turn into an empty dict - "num_samples": 1024, - "empirical": True, - } - self.compute_si_omega: Dict = { - "DEFAULT EMPTY": True, # will turn into an empty dict - "damping_factor": 0.0001, - } - self.compute_covariance: bool = False - self.extract_coreset: Dict = {} - self.reset_for_new_task: bool = False - - super().__init__(**kwargs) - - -class TransferMixin(BaseConfig): - def __init__(self, **kwargs): - self.load_kwargs(**kwargs) - - self.data_transfer: bool = False - self.scale_loss_with_arctanh: bool = False - self.synaptic_intelligence_computation: bool = False - self.freeze = None - self.freeze_bn: bool = False - self.transfer_restriction: Tuple = () - self.transfer_after_train: bool = False - self.single_input_stream: bool = True - self.readout_name: str = "fc" - self.reset: Tuple = () - self.reset_linear_frequency = None - self.regularization: Dict = { - 
"DEFAULT EMPTY": True, # will turn into an empty dict - "regularizer": "L2SP/Mixup/RDL/KnowledgeDistillation", - "alpha": 1.0, - "decay_alpha": True, - } - - super().__init__(**kwargs) - - def conditional_assignment(self): - if ( - self.reset_linear_frequency - and not "RandomReadoutReset" in self.main_loop_modules - ): - self.main_loop_modules.append("RandomReadoutReset") - if ( - self.synaptic_intelligence_computation - and not "SynapticIntelligence" in self.main_loop_modules - ): - self.main_loop_modules.append("SynapticIntelligence") - if ( - self.regularization - and not self.regularization["regularizer"] in self.main_loop_modules - ): - self.main_loop_modules.append(self.regularization["regularizer"]) - super().conditional_assignment() diff --git a/bias_transfer/configs/trainer/regression.py b/bias_transfer/configs/trainer/regression.py deleted file mode 100644 index e9480ab..0000000 --- a/bias_transfer/configs/trainer/regression.py +++ /dev/null @@ -1,21 +0,0 @@ -from typing import Dict - -from bias_transfer.configs.trainer.base import TrainerConfig -from bias_transfer.tables.nnfabrik import Trainer - - -class Regression(TrainerConfig): - config_name = "trainer" - table = Trainer() - fn = "bias_transfer.trainer.regression" - - def __init__(self, **kwargs): - self.load_kwargs(**kwargs) - - self.loss_functions: Dict = {"regression": "MSELoss"} - self.maximize: bool = False - self.noise_test: Dict = {} - self.apply_noise_to_validation: bool = False - self.show_epoch_progress: bool = False - - super().__init__(**kwargs) diff --git a/bias_transfer/dataset/MNIST_IB/__init__.py b/bias_transfer/dataset/MNIST_IB/__init__.py deleted file mode 100644 index 08f6d95..0000000 --- a/bias_transfer/dataset/MNIST_IB/__init__.py +++ /dev/null @@ -1,123 +0,0 @@ -import os -from pathlib import Path - -import torch -import numpy as np - -from torchvision.datasets import MNIST, FashionMNIST, EMNIST, KMNIST, QMNIST -from torchvision import transforms - -from nnfabrik.utility.nn_helpers import set_random_seed -from .addition import apply_additon -from .expansion import apply_expansion -from .noise import apply_gaussian_noise -from .color import apply_color, get_color_codes -from .translation import apply_translation -from .rotation import apply_rotation -from .shuffle import apply_label_shuffle - - -def generate_dataset(data_loader, transform_fs=(), options=()): - new_ds_source = [] - new_ds_target = [] - for source, target in data_loader: - source = source.detach().numpy() - target = target.detach().numpy() - for t, transform_f in enumerate(transform_fs): - if transform_f is None: - continue - source, target = transform_f(source, target, **options[t]) - new_ds_source.append(source) - new_ds_target.append(target) - new_ds_source = np.concatenate(new_ds_source) - new_ds_target = np.concatenate(new_ds_target) - return new_ds_source, new_ds_target - - -bias_dict = { - "color": ( - apply_color, - { - "cfg_means": get_color_codes(), - "cbg_means": get_color_codes(), - "bg": False, - "fg": True, - "color_variance": 0.02, - }, - ), - "color_easy": ( - apply_color, - { - "cfg_means": get_color_codes(), - "cbg_means": get_color_codes(), - "bg": False, - "fg": True, - "color_variance": 0.00, - }, - ), - "color_shuffle": ( - apply_color, - { - "cfg_means": get_color_codes(), - "cbg_means": get_color_codes(), - "bg": False, - "fg": True, - "color_variance": 0.02, - "shuffle": True, - }, - ), - "noise": (apply_gaussian_noise, {"severity": -1}), # random - "translation": (apply_translation, {"std": 5}), - "rotation": 
(apply_rotation, {}), - "rotation_regression": (apply_rotation, {"regression": True}), - "addition": (apply_additon, {}), - "clean": (None, {}), - "clean_shuffle": (apply_label_shuffle, {}), -} - - -def generate_and_save( - bias: str, - base_path: str = "/work/data/image_classification/torchvision/", - bias_options_: dict = None, - dataset: str = "MNIST", -): - set_random_seed(42) - write_path = os.path.join(base_path, f"{dataset}-IB") - Path(write_path).mkdir(parents=True, exist_ok=True) - if ( - os.path.isfile(os.path.join(write_path, f"{bias}_train_source.npy")) - and os.path.isfile(os.path.join(write_path, f"{bias}_train_target.npy")) - and os.path.isfile(os.path.join(write_path, f"{bias}_test_source.npy")) - and os.path.isfile(os.path.join(write_path, f"{bias}_test_target.npy")) - ): - return - apply_bias, bias_options = bias_dict[bias] - bias_options = bias_options_ if bias_options_ is not None else bias_options - transform = transforms.Compose([transforms.ToTensor(),]) - train = globals().get(dataset)( - root=base_path, train=True, download=True, transform=transform, - ) - test = globals().get(dataset)( - root=base_path, train=False, download=True, transform=transform, - ) - train_loader = torch.utils.data.DataLoader( - train, batch_size=64, shuffle=False, - ) - test_loader = torch.utils.data.DataLoader( - test, batch_size=64, shuffle=False, - ) - train_ds = generate_dataset( - data_loader=train_loader, - transform_fs=(apply_expansion, apply_bias), - options=({}, bias_options), - ) - test_ds = generate_dataset( - data_loader=test_loader, - transform_fs=(apply_expansion, apply_bias), - options=({}, bias_options), - ) - np.save(os.path.join(write_path, f"{bias}_train_source.npy"), train_ds[0]) - np.save(os.path.join(write_path, f"{bias}_train_target.npy"), train_ds[1]) - np.save(os.path.join(write_path, f"{bias}_test_source.npy"), test_ds[0]) - np.save(os.path.join(write_path, f"{bias}_test_target.npy"), test_ds[1]) diff --git a/bias_transfer/dataset/MNIST_IB/addition.py b/bias_transfer/dataset/MNIST_IB/addition.py deleted file mode 100644 index ffa6c86..0000000 --- a/bias_transfer/dataset/MNIST_IB/addition.py +++ /dev/null @@ -1,9 +0,0 @@ -import numpy as np - - -def apply_additon(source, target): - second_summand = np.arange(source.shape[0]) - np.random.shuffle(second_summand) - concat_source = np.concatenate([source, source[second_summand]], axis=3) - summed_targets = target + target[second_summand] - return concat_source, summed_targets \ No newline at end of file diff --git a/bias_transfer/dataset/MNIST_IB/color.py b/bias_transfer/dataset/MNIST_IB/color.py deleted file mode 100644 index 9c8b698..0000000 --- a/bias_transfer/dataset/MNIST_IB/color.py +++ /dev/null @@ -1,71 +0,0 @@ -import numpy as np - - -# code adapted from https://github.com/salesforce/corr_based_prediction/blob/master/gen_color_mnist.py -# procedure following https://arxiv.org/pdf/1812.10352.pdf -# they variaed color_variance between 0.05 and 0.02 (in 0.005 steps) - -class_color_means = [ - [60, 180, 75], # green - [255, 255, 25], # yellow - [0, 130, 200], # blue - [245, 130, 48], # orange - [70, 240, 240], # cyan - [240, 50, 230], # magenta - [230, 25, 75], # red - [0, 0, 128], # navy - [220, 190, 255], # lavender - [255, 250, 200], # beige -] -nb_classes = 10 - - -def get_color_codes(): - # C = np.random.rand(nb_classes,3) - C = np.asarray(class_color_means) - C = C / np.max(C, axis=1)[:, None] - return C - - -def get_std_color(means, targets, var): - mean = means[targets].reshape((-1)) - cov = var * 
np.eye(mean.shape[0]) - c = np.random.multivariate_normal(mean=mean, cov=cov) - c = c.reshape(targets.shape[0], 3, 1, 1) - return c - - -def apply_color( - x, - targets, - cfg_means=None, - cbg_means=None, - fg=True, - bg=False, - color_variance=0.0, - shuffle=False, -): - assert ( - len(x.shape) == 4 - ), "Something is wrong, size of input x should be 4 dimensional (B x C x H x W; perhaps number of channels is degenrate? If so, it should be 1)" - xs = x.shape - x = (((x * 255) > 10) * 255).astype(np.float) # thresholding to separate fg and bg - x_rgb = np.ones((xs[0], 3, xs[2], xs[3])).astype(np.float) - x_rgb = x_rgb * x - targets_ = np.copy(targets) - if shuffle: - np.random.shuffle(targets) # to generate cue-conflict by assigning wrong colors - if fg: - x_rgb_fg = 1.0 * x_rgb - x_rgb_fg *= get_std_color(cfg_means, targets, color_variance) - else: - x_rgb_fg = np.zeros_like(x_rgb) - if bg: - x_rgb_bg = 255 - x_rgb - x_rgb_bg *= get_std_color(cbg_means, targets, color_variance) - else: - x_rgb_bg = np.zeros_like(x_rgb) - x_rgb = x_rgb_fg + x_rgb_bg - x_rgb = np.clip(x_rgb, a_min=0.0, a_max=255.0) - color_data_x = x_rgb / 255.0 - return color_data_x, targets_ diff --git a/bias_transfer/dataset/MNIST_IB/expansion.py b/bias_transfer/dataset/MNIST_IB/expansion.py deleted file mode 100644 index 3cf89b8..0000000 --- a/bias_transfer/dataset/MNIST_IB/expansion.py +++ /dev/null @@ -1,8 +0,0 @@ -import numpy as np - - -def apply_expansion(source, target): - orig_shape = source.shape - expanded_batch = np.zeros((orig_shape[0], 1, 40, 40)) - expanded_batch[:, :, 6:-6, 6:-6] = source - return expanded_batch, target \ No newline at end of file diff --git a/bias_transfer/dataset/MNIST_IB/noise.py b/bias_transfer/dataset/MNIST_IB/noise.py deleted file mode 100644 index 76131ce..0000000 --- a/bias_transfer/dataset/MNIST_IB/noise.py +++ /dev/null @@ -1,12 +0,0 @@ -import numpy as np - - -def apply_gaussian_noise(batch, targets, severity=1): - if severity == -1: - severity = np.random.randint(1,6) - # adapted from https://github.com/google-research/mnist-c - c = [0.08, 0.12, 0.18, 0.26, 0.38][severity - 1] - return ( - np.clip(batch + np.random.normal(size=batch.shape, scale=c), 0, 1), - targets, - ) diff --git a/bias_transfer/dataset/MNIST_IB/plot.py b/bias_transfer/dataset/MNIST_IB/plot.py deleted file mode 100644 index 7926818..0000000 --- a/bias_transfer/dataset/MNIST_IB/plot.py +++ /dev/null @@ -1,21 +0,0 @@ -import matplotlib.pyplot as plt - - -def plot_batch(batch, targets, n_rows, n_cols, name="", file_type="png"): - batch = batch.transpose(0, 2, 3, 1) - fig, axs = plt.subplots(n_rows, n_cols) - if n_rows == 1: - axs = [axs] - for r in range(n_rows): - for c in range(n_cols): - axs[r][c].imshow(batch[r * n_cols + c].squeeze()) - axs[r][c].set_title(int(targets[r * n_cols + c])) - axs[r][c].set_axis_off() - plt.show() - if name: - fig.savefig( - name + "." 
+ file_type, - facecolor=fig.get_facecolor(), - edgecolor=fig.get_edgecolor(), - bbox_inches="tight", - ) diff --git a/bias_transfer/dataset/MNIST_IB/rotation.py b/bias_transfer/dataset/MNIST_IB/rotation.py deleted file mode 100644 index 502824e..0000000 --- a/bias_transfer/dataset/MNIST_IB/rotation.py +++ /dev/null @@ -1,11 +0,0 @@ -import numpy as np -from scipy import ndimage - - -def apply_rotation(source, target, regression=False): - angles = np.random.uniform(0, 360, source.shape[0]) - for i in range(source.shape[0]): - source[i] = ndimage.rotate(source[i], angles[i], reshape=False, axes=(1, 2)) - if regression: - target = angles - return source, target diff --git a/bias_transfer/dataset/MNIST_IB/run_generation.py b/bias_transfer/dataset/MNIST_IB/run_generation.py deleted file mode 100644 index ff97f8a..0000000 --- a/bias_transfer/dataset/MNIST_IB/run_generation.py +++ /dev/null @@ -1,31 +0,0 @@ -import os -import numpy as np -from . import generate_and_save - - -def main(dataset="FashionMNIST"): - for bias in ["clean", - "color", - "color_shuffle", - "translation", - "rotation", - "rotation_regression", - "noise", - # "addition" - ]: - generate_and_save( - bias, base_path="/work/data/image_classification/torchvision/",dataset=dataset - ) - train_tensor = np.load( - os.path.join( - f"/work/data/image_classification/torchvision/{dataset}-IB", - f"{bias}_train_source.npy", - ) - ) - mean = np.mean(train_tensor, axis=(0, 2, 3)) - std = np.std(train_tensor, axis=(0, 2, 3)) - print(f"Saved {dataset}-{bias} with mean {mean} and std {std}") - - -if __name__ == "__main__": - main() diff --git a/bias_transfer/dataset/MNIST_IB/shuffle.py b/bias_transfer/dataset/MNIST_IB/shuffle.py deleted file mode 100644 index 7cb8631..0000000 --- a/bias_transfer/dataset/MNIST_IB/shuffle.py +++ /dev/null @@ -1,6 +0,0 @@ -import numpy as np - - -def apply_label_shuffle(source, target): - np.random.shuffle(target) # to make this dataset random - return source, target diff --git a/bias_transfer/dataset/MNIST_IB/translation.py b/bias_transfer/dataset/MNIST_IB/translation.py deleted file mode 100644 index e906e28..0000000 --- a/bias_transfer/dataset/MNIST_IB/translation.py +++ /dev/null @@ -1,15 +0,0 @@ -import numpy as np - - -def apply_translation(source, target, std=5): - # 40x40 to follow https://www.cs.toronto.edu/~tijmen/affNIST/ for translation - offsets = np.clip( - np.random.normal(scale=std, size=source.shape[0] * 2), a_min=-6, a_max=6 - ) - offsets = offsets.astype(np.int) - x_offset, y_offset = offsets[: source.shape[0]], offsets[source.shape[0]:] - for b in range(source.shape[0]): - source[b, 0, :, :] = np.roll( - source[b, 0, :, :], (y_offset[b], x_offset[b]), axis=(0, 1) - ) - return source, target diff --git a/bias_transfer/dataset/__init__.py b/bias_transfer/dataset/__init__.py deleted file mode 100644 index 0949899..0000000 --- a/bias_transfer/dataset/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .img_dataset_loader import img_dataset_loader -from .neural_dataset_loader import neural_dataset_loader -from .mtl_datasets_loader import mtl_datasets_loader -from .regression_dataset_loader import regression_dataset_loader -from .transferred_dataset_loader import transferred_dataset_loader diff --git a/bias_transfer/dataset/img_dataset_loader.py b/bias_transfer/dataset/img_dataset_loader.py deleted file mode 100644 index a3446db..0000000 --- a/bias_transfer/dataset/img_dataset_loader.py +++ /dev/null @@ -1,434 +0,0 @@ -import os -import numpy as np -import torch -import torchvision -import 
torchvision.transforms as transforms -from torch.utils.data.dataset import ConcatDataset, Subset -from torch.utils.data.sampler import SubsetRandomSampler -from torchvision import datasets -from bias_transfer.configs.dataset import ImageDatasetConfig -from .MNIST_IB import generate_and_save -from .dataset_classes.pkl_dataset import PklDataset -from .dataset_classes.npy_dataset import NpyDataset -from .utils import ( - get_dataset, - create_ImageFolder_format, -) - -DATASET_URLS = { - "TinyImageNet": "http://cs231n.stanford.edu/tiny-imagenet-200.zip", - "CIFAR10-Semisupervised": "1LTw3Sb5QoiCCN-6Y5PEKkq9C9W60w-Hi", - "CIFAR10-C": "https://zenodo.org/record/2535967/files/CIFAR-10-C.tar", - "CIFAR100-C": "https://zenodo.org/record/3555552/files/CIFAR-100-C.tar", - "TinyImageNet-C": "https://zenodo.org/record/2536630/files/Tiny-ImageNet-C.tar", - "TinyImageNet-ST": "https://informatikunihamburgde-my.sharepoint.com/:u:/g/personal/shahd_safarani_informatik_uni-hamburg_de/EZhUKKVXTvRHlqi2HXHaIjEBLmAv4tQP8olvdGNRoWrPqA?e=8kSrHI&download=1", - "ImageNet": None, - "ImageNet-C": { - "blur": "https://zenodo.org/record/2235448/files/blur.tar", - "digital": "https://zenodo.org/record/2235448/files/digital.tar", - "extra": "https://zenodo.org/record/2235448/files/extra.tar", - "noise": "https://zenodo.org/record/2235448/files/noise.tar", - "weather": "https://zenodo.org/record/2235448/files/weather.tar", - }, -} - - -def img_dataset_loader(seed, **config): - """ - Utility function for loading and returning train and valid - multi-process iterators over the CIFAR-10 dataset. A sample - 9x9 grid of the images can be optionally displayed. - If using CUDA, num_workers should be set to 1 and pin_memory to True. - Params - ------ - - data_dir: path directory to the dataset. - - batch_size: how many samples per batch to load. - - augment: whether to apply the data augmentation scheme - mentioned in the paper. Only applied on the train split. - - seed: fix seed for reproducibility. - - valid_size: percentage split of the training set used for - the validation set. Should be a float in the range [0, 1]. - - shuffle: whether to shuffle the train/validation indices. - - show_sample: plot 9x9 sample grid of the dataset. - - num_workers: number of subprocesses to use when loading the dataset. - - pin_memory: whether to copy tensors into CUDA pinned memory. Set it to - True if using GPU. - Returns - ------- - - train_loader: training set iterator. - - valid_loader: validation set iterator. - """ - config = ImageDatasetConfig.from_dict(config) - print("Loading dataset: {}".format(config.dataset_cls)) - torch.manual_seed(seed) - np.random.seed(seed) - - transform_test, transform_train, transform_val = get_transforms(config) - - error_msg = "[!] valid_size should be in the range [0, 1]." 
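The docstring above describes valid_size as the fraction of the training set held out for validation; further down, the deleted loader realizes this by shuffling the training indices once and handing the two halves to SubsetRandomSampler. A self-contained sketch of that split, with a random TensorDataset standing in for the torchvision dataset:

import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.sampler import SubsetRandomSampler

seed, valid_size, batch_size = 42, 0.1, 64
dataset = TensorDataset(torch.randn(1000, 3, 32, 32), torch.randint(0, 10, (1000,)))

indices = list(range(len(dataset)))
split = int(np.floor(valid_size * len(dataset)))
np.random.seed(seed)
np.random.shuffle(indices)
train_idx, valid_idx = indices[split:], indices[:split]

train_loader = DataLoader(dataset, batch_size=batch_size,
                          sampler=SubsetRandomSampler(train_idx), shuffle=False)
valid_loader = DataLoader(dataset, batch_size=batch_size,
                          sampler=SubsetRandomSampler(valid_idx), shuffle=False)
print(len(train_idx), len(valid_idx))  # 900 100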
- assert (config.valid_size >= 0) and (config.valid_size <= 1), error_msg - - ( - train_dataset, - valid_dataset, - test_dataset, - c_test_datasets, - st_test_dataset, - ) = get_datasets(config, transform_test, transform_train, transform_val) - - filters = [globals().get(f)(config, train_dataset) for f in config.filters] - datasets_ = [train_dataset, valid_dataset, test_dataset] - if config.add_corrupted_test: - for c_ds in c_test_datasets.values(): - datasets_ += list(c_ds.values()) - for ds in datasets_: - for filt in filters: - filt.apply(ds) - - data_loaders = get_data_loaders( - st_test_dataset, - c_test_datasets, - config, - seed, - test_dataset, - train_dataset, - valid_dataset, - ) - - return data_loaders - - -def get_transforms(config): - if config.dataset_cls == "ImageNet": - transform_train = [ - transforms.RandomResizedCrop(config.input_size) - if config.apply_augmentation - else None, - transforms.RandomHorizontalFlip() if config.apply_augmentation else None, - transforms.Grayscale() if config.apply_grayscale else None, - transforms.ToTensor(), - transforms.Normalize(config.train_data_mean, config.train_data_std) - if config.apply_normalization - else None, - ] - transform_val = [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.Grayscale() if config.apply_grayscale else None, - transforms.ToTensor(), - transforms.Normalize(config.train_data_mean, config.train_data_std) - if config.apply_normalization - else None, - ] - transform_test = [ # TODO: we don't need resizing + cropping for IN-C! - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.Grayscale() if config.apply_grayscale else None, - transforms.ToTensor(), - transforms.Normalize(config.train_data_mean, config.train_data_std) - if config.apply_normalization - else None, - ] - else: - transform_train = [ - transforms.ToPILImage() - if config.dataset_cls == "CIFAR10-Semisupervised" - or config.dataset_cls == "MNIST-IB" - else None, - transforms.RandomCrop(config.input_size, padding=4) - if config.apply_augmentation - else None, - transforms.RandomHorizontalFlip() if config.apply_augmentation else None, - transforms.RandomRotation(15) - if config.apply_augmentation and not "MNIST" in config.dataset_cls - else None, - transforms.Grayscale() if config.apply_grayscale else None, - transforms.ToTensor(), - transforms.Lambda(lambda x: x.repeat(3, 1, 1)) - if config.convert_to_rgb - else None, - transforms.Normalize(config.train_data_mean, config.train_data_std) - if config.apply_normalization - else None, - ] - transform_val = [ - transforms.ToPILImage() - if config.dataset_cls == "CIFAR10-Semisupervised" - or config.dataset_cls == "MNIST-IB" - else None, - transforms.Grayscale() if config.apply_grayscale else None, - transforms.ToTensor(), - transforms.Lambda(lambda x: x.repeat(3, 1, 1)) - if config.convert_to_rgb - else None, - transforms.Normalize(config.train_data_mean, config.train_data_std) - if config.apply_normalization - else None, - ] - transform_test = [ - transforms.ToPILImage() if config.dataset_cls == "MNIST-IB" else None, - transforms.Grayscale() if config.apply_grayscale else None, - transforms.ToTensor(), - transforms.Lambda(lambda x: x.repeat(3, 1, 1)) - if config.convert_to_rgb - else None, - transforms.Normalize(config.train_data_mean, config.train_data_std) - if config.apply_normalization - else None, - ] - transform_test = transforms.Compose( - list(filter(lambda x: x is not None, transform_test)) - ) - transform_val = transforms.Compose( - list(filter(lambda x: x is not 
None, transform_val)) - ) - transform_train = transforms.Compose( - list(filter(lambda x: x is not None, transform_train)) - ) - return transform_test, transform_train, transform_val - - -def get_datasets(config, transform_test, transform_train, transform_val): - if ( - config.dataset_cls in list(torchvision.datasets.__dict__.keys()) - and config.dataset_cls != "ImageNet" - ): - dataset_cls = eval("torchvision.datasets." + config.dataset_cls) - kwargs = { - "root": config.data_dir, - "transform": transform_train, - "download": True, - } - - if config.dataset_cls == "SVHN": - kwargs["split"] = "train" - else: - kwargs["train"] = True - train_dataset = dataset_cls(**kwargs) - - kwargs["transform"] = transform_val - valid_dataset = dataset_cls(**kwargs) - - kwargs["transform"] = transform_test - if config.dataset_cls == "SVHN": - kwargs["split"] = "test" - else: - kwargs["train"] = False - test_dataset = dataset_cls(**kwargs) - elif config.dataset_cls == "MNIST-IB": - dataset_dir = os.path.join(config.data_dir, config.dataset_sub_cls + "-IB") - generate_and_save( - config.bias, base_path=config.data_dir, dataset=config.dataset_sub_cls - ) - train_dataset = NpyDataset( - f"{config.bias}_train_source.npy", - f"{config.bias}_train_target.npy", - root=dataset_dir, - transform=transform_train, - target_type=torch.float32 if "regression" in config.bias else torch.long, - ) - valid_dataset = NpyDataset( - f"{config.bias}_train_source.npy", - f"{config.bias}_train_target.npy", - root=dataset_dir, - transform=transform_val, - target_type=torch.float32 if "regression" in config.bias else torch.long, - ) - test_dataset = NpyDataset( - f"{config.bias}_test_source.npy", - f"{config.bias}_test_target.npy", - root=dataset_dir, - transform=transform_test, - target_type=torch.float32 if "regression" in config.bias else torch.long, - ) - else: - dataset_dir = get_dataset( - DATASET_URLS[config.dataset_cls], - config.data_dir, - dataset_cls=config.dataset_cls, - ) - - train_dir = os.path.join(dataset_dir, "train") - if config.dataset_cls == "CIFAR10-Semisupervised": - train_dataset = PklDataset( - train_dir, transform=transform_train, root=config.data_dir - ) - valid_dataset = PklDataset( - train_dir, transform=transform_val, root=config.data_dir - ) - dataset_cls = torchvision.datasets.CIFAR10 - test_dataset = dataset_cls( - root=config.data_dir, - train=False, - transform=transform_test, - ) - else: - if config.dataset_cls != "ImageNet": - create_ImageFolder_format(dataset_dir) - val_dir = os.path.join(dataset_dir, "val", "images") - train_dataset = datasets.ImageFolder(train_dir, transform=transform_train) - valid_dataset = datasets.ImageFolder(train_dir, transform=transform_val) - test_dataset = datasets.ImageFolder(val_dir, transform=transform_test) - - st_test_dataset = None - if config.add_stylized_test: - st_dataset_dir = get_dataset( - DATASET_URLS[config.dataset_cls + "-ST"], - config.data_dir, - dataset_cls=config.dataset_cls + "-ST", - ) - st_test_dataset = datasets.ImageFolder(st_dataset_dir, transform=transform_test) - - c_test_datasets = None - if config.add_corrupted_test: - urls = DATASET_URLS[config.dataset_cls + "-C"] - if not isinstance(urls, dict): - urls = {"default": urls} - for key, url in urls.items(): - dataset_dir = get_dataset( - url, - config.data_dir, - dataset_cls=config.dataset_cls + "-C", - ) - - c_test_datasets = {} - for c_category in os.listdir(dataset_dir): - if config.dataset_cls in ("CIFAR10", "CIFAR100"): - if c_category == "labels.npy" or not c_category.endswith(".npy"): 
- continue - c_test_datasets[c_category[:-4]] = {} - for c_level in range(1, 6): - start = (c_level - 1) * 10000 - end = c_level * 10000 - c_test_datasets[c_category[:-4]][c_level] = NpyDataset( - samples=c_category, - targets="labels.npy", - root=dataset_dir, - start=start, - end=end, - transform=transform_test, - ) - else: - if not os.path.isdir(os.path.join(dataset_dir, c_category)): - continue - c_test_datasets[c_category] = {} - for c_level in os.listdir(os.path.join(dataset_dir, c_category)): - c_test_datasets[c_category][ - int(c_level) - ] = datasets.ImageFolder( - os.path.join(dataset_dir, c_category, c_level), - transform=transform_test, - ) - return train_dataset, valid_dataset, test_dataset, c_test_datasets, st_test_dataset - - -def get_data_loaders( - st_test_dataset, - c_test_datasets, - config, - seed, - test_dataset, - train_dataset, - valid_dataset, -): - num_train = len(train_dataset) - indices = list(range(num_train)) - if config.use_c_test_as_val: # Use valid_size of the c_test set for validation - train_sampler = SubsetRandomSampler(indices) - datasets = [] - val_indices = [] - start_idx = 0 - for c_category in c_test_datasets.keys(): - if c_category not in ( - "speckle_noise", - "gaussian_blur", - "spatter", - "saturate", - ): - continue - for dataset in c_test_datasets[c_category].values(): - num_val = len(dataset) - indices = list(range(start_idx, start_idx + num_val)) - split = int(np.floor(config.valid_size * num_val)) - if config.shuffle: - np.random.shuffle(indices) - val_indices += indices[:split] - datasets.append(dataset) - start_idx += num_val - valid_dataset = ConcatDataset(datasets) - valid_sampler = SubsetRandomSampler(val_indices) - else: # Use valid_size of the train set for validation - split = int(np.floor(config.valid_size * num_train)) - if config.shuffle: - np.random.seed(seed) - np.random.shuffle(indices) - train_idx, valid_idx = indices[split:], indices[:split] - if config.train_subset: - subset_split = int(np.floor(config.train_subset * len(train_idx))) - train_idx = train_idx[:subset_split] - if config.shuffle: - train_sampler = SubsetRandomSampler(train_idx) - valid_sampler = SubsetRandomSampler(valid_idx) - else: - train_dataset = Subset(train_dataset, train_idx) - valid_dataset = Subset(train_dataset, valid_idx) - train_sampler = None - valid_sampler = None - train_loader = torch.utils.data.DataLoader( - train_dataset, - batch_size=config.batch_size, - sampler=train_sampler, - num_workers=config.num_workers, - pin_memory=config.pin_memory, - shuffle=False, - ) - valid_loader = torch.utils.data.DataLoader( - valid_dataset, - batch_size=config.batch_size, - sampler=valid_sampler, - num_workers=config.num_workers, - pin_memory=config.pin_memory, - shuffle=False, - ) - test_loader = torch.utils.data.DataLoader( - test_dataset, - batch_size=config.batch_size, - num_workers=config.num_workers, - pin_memory=config.pin_memory, - shuffle=True, - ) - task_key = ( - "regression" - if config.bias is not None and "regression" in config.bias - else "img_classification" - ) - data_loaders = { - "train": {task_key: train_loader}, - "validation": {task_key: valid_loader}, - "test": {task_key: test_loader}, - } - - if config.add_stylized_test: - st_test_loader = torch.utils.data.DataLoader( - st_test_dataset, - batch_size=config.batch_size, - num_workers=config.num_workers, - pin_memory=config.pin_memory, - shuffle=False, - ) - data_loaders["st_test"] = st_test_loader - - if config.add_corrupted_test: - c_test_loaders = {} - for c_category in 
c_test_datasets.keys(): - c_test_loaders[c_category] = {} - for c_level, dataset in c_test_datasets[c_category].items(): - c_test_loaders[c_category][c_level] = torch.utils.data.DataLoader( - dataset, - batch_size=config.batch_size, - num_workers=config.num_workers, - pin_memory=config.pin_memory, - shuffle=True, - ) - data_loaders["c_test"] = {"img_classification": c_test_loaders} - return data_loaders diff --git a/bias_transfer/dataset/mtl_datasets_loader.py b/bias_transfer/dataset/mtl_datasets_loader.py deleted file mode 100644 index 10c34be..0000000 --- a/bias_transfer/dataset/mtl_datasets_loader.py +++ /dev/null @@ -1,54 +0,0 @@ -from bias_transfer.configs.dataset import MTLDatasetsConfig -from nnfabrik.builder import resolve_data -from .img_dataset_loader import img_dataset_loader -from .neural_dataset_loader import neural_dataset_loader - -# -# def mtl_datasets_loader(seed, **config): -# neural_dataset_config = config.pop("neural_dataset_config") -# img_dataset_config = config.pop("img_dataset_config") -# -# neural_dataset_config.pop("seed") -# -# neural_dataset_loaders = neural_dataset_loader(seed, **neural_dataset_config) -# img_dataset_loaders = img_dataset_loader(seed, **img_dataset_config) -# -# data_loaders = neural_dataset_loaders -# data_loaders["train"]["img_classification"] = img_dataset_loaders["train"][ -# "img_classification" -# ] -# data_loaders["validation"]["img_classification"] = img_dataset_loaders[ -# "validation" -# ]["img_classification"] -# data_loaders["test"]["img_classification"] = img_dataset_loaders["test"][ -# "img_classification" -# ] -# if "c_test" in img_dataset_loaders: -# data_loaders["c_test"] = img_dataset_loaders["c_test"] -# return data_loaders - -def update(to_update, new_entries, prefix=""): - for k,v in new_entries.items(): - if prefix: - k = prefix + "_" + k - to_update[k] = v - -def mtl_datasets_loader(seed, **config): - mtl_config = MTLDatasetsConfig.from_dict(config) - mtl_data_loaders = {"train": {}, "validation": {}, "test": {}} - for prefix, dataset_config in mtl_config.items(): - dataset_config.seed = seed - dataset_fn = resolve_data(dataset_config.fn) - data_loaders = dataset_fn(**dataset_config.to_dict()) - update(mtl_data_loaders["train"], data_loaders["train"], prefix) - update(mtl_data_loaders["validation"], data_loaders["validation"], prefix) - update(mtl_data_loaders["test"], data_loaders["test"], prefix) - if "c_test" in data_loaders: - if "c_test" not in mtl_data_loaders: - mtl_data_loaders["c_test"] = {} - update(mtl_data_loaders["c_test"], data_loaders["c_test"], prefix) - if "st_test" in data_loaders: - if "st_test" not in mtl_data_loaders: - mtl_data_loaders["st_test"] = {} - update(mtl_data_loaders["st_test"], data_loaders["st_test"], prefix) - return mtl_data_loaders diff --git a/bias_transfer/dataset/neural_dataset_loader.py b/bias_transfer/dataset/neural_dataset_loader.py deleted file mode 100644 index f6b8b1c..0000000 --- a/bias_transfer/dataset/neural_dataset_loader.py +++ /dev/null @@ -1,29 +0,0 @@ -import numpy as np -import torch - -from nnfabrik import builder -import os -from os import listdir -from os.path import isfile, join - - -def neural_dataset_loader(seed, **config): - config.pop("comment", None) - data_dir = config.pop("data_dir", None) - neuronal_data_path = os.path.join(data_dir, "neuronal_data/") - config["neuronal_data_files"] = [ - neuronal_data_path + f - for f in listdir(neuronal_data_path) - if isfile(join(neuronal_data_path, f)) - ] - config["image_cache_path"] = os.path.join(data_dir, 
"images/individual") - torch.manual_seed(seed) - np.random.seed(seed) - dataset_fn = "nnvision.datasets.monkey_static_loader" - data_loaders = builder.get_data(dataset_fn, config) - dataloaders = { - "train": data_loaders["train"], - "validation": {"neural": data_loaders["validation"]}, - "test": {"neural": data_loaders["test"]}, - } - return dataloaders diff --git a/bias_transfer/dataset/regression_dataset_loader.py b/bias_transfer/dataset/regression_dataset_loader.py deleted file mode 100644 index 5bdeb43..0000000 --- a/bias_transfer/dataset/regression_dataset_loader.py +++ /dev/null @@ -1,135 +0,0 @@ -import h5py -import numpy as np -import torch -import torch.utils.data as Data -from sklearn.datasets import fetch_openml - -from bias_transfer.configs.dataset import Regression - - -def load_mauna_loa_atmospheric_co2(): - ml_data = fetch_openml(data_id=41187) - months = [] - ppmv_sums = [] - counts = [] - - y = ml_data.data[:, 0] - m = ml_data.data[:, 1] - month_float = y + (m - 1) / 12 - ppmvs = ml_data.target - - for month, ppmv in zip(month_float, ppmvs): - if not months or month != months[-1]: - months.append(month) - ppmv_sums.append(ppmv) - counts.append(1) - else: - # aggregate monthly sum to produce average - ppmv_sums[-1] += ppmv - counts[-1] += 1 - - months = np.asarray(months).reshape(-1, 1) - avg_ppmvs = np.asarray(ppmv_sums) / counts - # normalize: - avg_ppmvs -= np.mean(avg_ppmvs) - avg_ppmvs /= np.std(avg_ppmvs) - X_plot = months - Y_plot = avg_ppmvs - X_train = np.concatenate((X_plot[:120], X_plot[150:300], X_plot[380:450])) - Y_train = np.concatenate((Y_plot[:120], Y_plot[150:300], Y_plot[380:450])) - - return X_plot, Y_plot, X_train, Y_train - - -def load_co2(): - f = h5py.File("co2_data.h5", "r") - data_train = np.concatenate((f["data"].value, f["label"].value), axis=1) - f.close() - X_train = data_train[:, 0].reshape(-1, 1) - Y_train = data_train[:, 1].reshape(-1) - - X_plot = np.concatenate((X_train, np.arange(1.73, 3.51, 0.01).reshape(-1, 1))) - Y_plot = np.concatenate((Y_train, np.zeros((int((3.51 - 1.73) // 0.01 + 1),)))) - X_train = np.concatenate((X_train[:120], X_train[150:])) - Y_train = np.concatenate((Y_train[:120], Y_train[150:])) - - return X_plot, Y_plot, X_train, Y_train - - -def load_sinusoid_data(noisy=False, train_range=10): - def f(x): - return (np.sin(x)).ravel() - - rng = np.random.RandomState(0) - X_plot = np.linspace(-10, 40, 1000).reshape(-1, 1) - X_train = np.sort(train_range * rng.rand(10 * train_range, 1), axis=0) - # X_train = np.concatenate((X_train, (np.sort(10 * rng.rand(100, 1) + 20, axis=0)))) - Y_train = f(X_train) - Y_plot = f(X_plot) - if noisy: - Y_train = Y_train + 1 * (0.5 - rng.rand(X_train.shape[0])) - return X_plot, Y_plot, X_train, Y_train - - -def regression_dataset_loader(seed, **config): - config = Regression.from_dict(config) - print("Loading dataset: {}".format(config.dataset_cls)) - torch.manual_seed(seed) - np.random.seed(seed) - - error_msg = "[!] valid_size should be in the range [0, 1]." 
- assert (config.valid_size >= 0) and (config.valid_size <= 1), error_msg - - if config.dataset_cls == "co2": - X_plot, Y_plot, X_train, Y_train = load_co2() - elif config.dataset_cls == "co2_original": - X_plot, Y_plot, X_train, Y_train = load_mauna_loa_atmospheric_co2() - else: - X_plot, Y_plot, X_train, Y_train = load_sinusoid_data( - noisy=config.noisy, train_range=config.train_range - ) - - train_len = X_train.shape[0] - valid_start = int(train_len * (1.0 - config.valid_size)) - train_dataset = Data.TensorDataset( - torch.tensor(X_train[:valid_start], dtype=torch.float), - torch.tensor(Y_train[:valid_start], dtype=torch.float), - ) - valid_dataset = Data.TensorDataset( - torch.tensor(X_train[valid_start:], dtype=torch.float), - torch.tensor(Y_train[valid_start:], dtype=torch.float), - ) - test_dataset = Data.TensorDataset( - torch.tensor(X_plot, dtype=torch.float), - torch.tensor(Y_plot, dtype=torch.float), - ) - - train_loader = torch.utils.data.DataLoader( - train_dataset, - batch_size=config.batch_size, - num_workers=config.num_workers, - pin_memory=config.pin_memory, - shuffle=True, - ) - valid_loader = torch.utils.data.DataLoader( - valid_dataset, - batch_size=config.batch_size, - num_workers=config.num_workers, - pin_memory=config.pin_memory, - shuffle=False, - ) - test_loader = torch.utils.data.DataLoader( - test_dataset, - batch_size=config.batch_size, - num_workers=config.num_workers, - pin_memory=config.pin_memory, - shuffle=False, - ) - - data_loaders = { - "train": {"regression": train_loader}, - "validation": {"regression": valid_loader}, - "test": {"regression": test_loader}, - } - - return data_loaders diff --git a/bias_transfer/dataset/transferred_dataset_loader.py b/bias_transfer/dataset/transferred_dataset_loader.py deleted file mode 100644 index 5eed0ab..0000000 --- a/bias_transfer/dataset/transferred_dataset_loader.py +++ /dev/null @@ -1,73 +0,0 @@ -import torch -from torch.utils.data import TensorDataset - -from bias_transfer.dataset import img_dataset_loader -from bias_transfer.dataset.dataset_classes.combined_dataset import ParallelDataset -from bias_transfer.dataset.dataset_classes.npy_dataset import NpyDataset - -def load_npy(postfix, data_key, transfer_data, data_loaders, main_data_loader): - transferred_dataset = NpyDataset( - samples=transfer_data["source" + postfix], - targets=transfer_data["target" + postfix], - ) - data_loaders["train"][data_key] = torch.utils.data.DataLoader( - dataset=transferred_dataset, - batch_size=main_data_loader.batch_size, - num_workers=main_data_loader.num_workers, - pin_memory=main_data_loader.pin_memory, - shuffle=True, - ) - - -def transferred_dataset_loader(seed, primary_dataset_fn=img_dataset_loader, **config): - transfer_data_file = config.pop("transfer_data") - transfer_data = {k: transfer_data_file[k] for k in transfer_data_file.files} - - data_loaders = primary_dataset_fn(seed, **config) - main_task = next(iter(data_loaders["train"].keys())) - main_data_loader = data_loaders["train"][main_task] - main_dataset = main_data_loader.dataset - if "covariance" in transfer_data: - data_loaders["covariance"] = transfer_data.pop("covariance") - - if "source_cs" in transfer_data: # we have a coreset - if config.get("train_on_coreset"): - load_npy("_cs", main_task, transfer_data, data_loaders, main_data_loader) - else: - if config.get("train_on_reduced_data"): - load_npy("", main_task, transfer_data, data_loaders, main_data_loader) - if config.get("load_coreset"): - load_npy("_cs", f"{main_task}_cs", transfer_data, data_loaders, 
main_data_loader) - else: - datasets = {} - for rep_name, rep_data in transfer_data.items(): - datasets[rep_name] = TensorDataset(torch.from_numpy(rep_data)) - if "source" in transfer_data: # we have input data - source_ds = datasets.pop("source") - transfer_dataset = ParallelDataset( - source_datasets={"img": source_ds}, target_datasets=datasets - ) - transfer_data_loader = torch.utils.data.DataLoader( - dataset=transfer_dataset, - batch_size=main_data_loader.batch_size, - sampler=main_data_loader.sampler, - num_workers=main_data_loader.num_workers, - pin_memory=main_data_loader.pin_memory, - shuffle=False, - ) - data_loaders["train"]["transfer"] = transfer_data_loader - else: # we don't have input data -> only targets that are presented in parallel to class-labels - datasets["class"] = main_dataset - combined_dataset = ParallelDataset( - source_datasets={"img": main_dataset}, target_datasets=datasets - ) - combined_data_loader = torch.utils.data.DataLoader( - dataset=combined_dataset, - batch_size=main_data_loader.batch_size, - sampler=main_data_loader.sampler, - num_workers=main_data_loader.num_workers, - pin_memory=main_data_loader.pin_memory, - shuffle=False, - ) - data_loaders["train"][main_task] = combined_data_loader - return data_loaders diff --git a/bias_transfer/gp/gp_regression.py b/bias_transfer/gp/gp_regression.py deleted file mode 100644 index 7ce3544..0000000 --- a/bias_transfer/gp/gp_regression.py +++ /dev/null @@ -1,39 +0,0 @@ -import numpy as np -from scipy.optimize import minimize -from functools import partial - - -def posterior_predictive(X, X_train, Y_train, kernel, **opts): - K = kernel(X_train, X_train, **opts) - K_s = kernel(X_train, X, **opts) - K_ss = kernel(X, X, **opts) - - K_inv = np.linalg.inv(K) - - mu_s = K_s.T @ K_inv @ Y_train - cov_s = K_ss - K_s.T @ K_inv @ K_s - - return mu_s, cov_s - - -def optimize_hyper_params(kernel,X_train, Y_train): - # -log liklihood - def nll_fn(x, y): - def step(theta): - K = kernel(x, x, sigma=theta[0], l=theta[1], sigma_f=theta[2], p=theta[3]) - return np.sum(np.log(np.diagonal(np.linalg.cholesky(K)))) + \ - 0.5 * y.T @ np.linalg.inv(K) @ y + \ - 0.5 * len(x) * np.log(2 * np.pi) - - return step - - # minimize -log liklihood - res = minimize(nll_fn(X_train, Y_train), [0.01, 1, 1, 5.0], - bounds=((1e-5, 1e1), (1e-5, None), (1e-5, None), (1e-2, 1e1)), - method='L-BFGS-B') - - sigma_opt, l_opt, sigma_f_opt, p_opt = res.x - fitted_kernel = partial(kernel, sigma_f=sigma_f_opt, l=l_opt, sigma=sigma_opt, p=p_opt) - return fitted_kernel - # mu_s, cov_s = posterior_predictive(X_plot, X_train, Y_train_noisy, l=l_opt, sigma_f=sigma_f_opt, p=p_opt, - # sigma=sigma_opt) \ No newline at end of file diff --git a/bias_transfer/gp/kernels.py b/bias_transfer/gp/kernels.py deleted file mode 100644 index b3d206d..0000000 --- a/bias_transfer/gp/kernels.py +++ /dev/null @@ -1,50 +0,0 @@ -from scipy.spatial.distance import cdist -import numpy as np -import torch -from .nn_kernel import compute_cov_matrix - - -def linear(x1, x2, sigma_b=1, sigma_v=1, c=0, **kwargs): - """ - Linear Kernel: $k(x_1,x_2) =\sigma_b^2 + \sigma_v^2 (x_1 - c)(x_2-c)$ - """ - return sigma_b ** 2 + sigma_v ** 2 * np.inner(x1 - c, x2 - c) - - -def rbf(x1, x2, l=1, sigma_f=1, **kwargs): - """ - RBF Kernel: $k(x_1,x_2) =\sigma^2 \exp\left( - \frac{||x_1-x_2||^2}{2l^2} \right)$ - """ - dists = cdist(x1 / l, x2 / l, metric="sqeuclidean") - return sigma_f ** 2 * np.exp(-0.5 * dists) - - -def periodic(x1, x2, l=1.0, sigma_f=1.0, p=5.0, **kwargs): - """ - Periodic Kernel: 
$k(x_1,x_2) =\sigma^2 \exp\left( - \frac{2\sin^2(\pi|x_1-x_2|/p)}{l^2} \right)$ - """ - dists = cdist(x1, x2, metric="euclidean") - return sigma_f ** 2 * np.exp(-2 * (np.sin(np.pi / p * dists) / l) ** 2) - - -def locally_periodic(x1, x2, l=1, sigma_f=0.5, p=2.0): - """ - Locally Periodic Kernel: $k(x_1,x_2) =\sigma^2 \exp\left( - \frac{2\sin^2(\pi|x_1-x_2|/p)}{l^2} \right) \exp\left(-\frac{||x_1-x_2||^2}{2l^2}\right)$ - """ - return periodic(x1, x2, l, sigma_f, p) * rbf(x1, x2, l, sigma_f) / sigma_f ** 2 - - -def white_noise(x1, x2, sigma=0.1, **kwargs): - """ - White Noise Kernel: $k(x_1,x_2) = \sigma^2 \cdot I_n$ - """ - if x1 is x2: - return sigma ** 2 * np.eye(len(x1)) - else: - return np.zeros((len(x1), len(x2))) - - -def add_white_noise(kernel): - return lambda x1, x2, sigma_noise=0.1, **opts: white_noise(x1, x2, sigma_noise) + kernel( - x1, x2, **opts - ) diff --git a/bias_transfer/gp/nn_kernel.py b/bias_transfer/gp/nn_kernel.py deleted file mode 100644 index 41d4eb4..0000000 --- a/bias_transfer/gp/nn_kernel.py +++ /dev/null @@ -1,171 +0,0 @@ -from functools import partial - -import numpy as np -import torch -from scipy.optimize import minimize -from torch.autograd import Variable -from tqdm import tqdm - - -def compute_cov_matrix(x1, x2, sigma=None): - x1_flat = x1.reshape((x1.shape[0], -1)) - centered1 = x1_flat # - x1_flat.mean(axis=1).reshape((-1, 1)) - x2_flat = x2.reshape((x2.shape[0], -1)) - centered2 = x2_flat # - x2_flat.mean(axis=1).reshape((-1, 1)) - if sigma is not None: - result = ( - centered1 - @ sigma - @ centered2.T - # / np.outer(np.linalg.norm(centered1, 2, axis=1), np.linalg.norm(centered2, 2, axis=1)) - ) # see https://de.mathworks.com/help/images/ref/corr2.html - else: - result = ( - centered1 - @ centered2.T - # / np.outer(np.linalg.norm(centered1, 2, axis=1), np.linalg.norm(centered2, 2, axis=1)) - ) # see https://de.mathworks.com/help/images/ref/corr2.html - return result - - -def nn_kernel(x1, x2, net, train_reps=None, weights=None, device="cpu", sigma=None): - def get_reps(x): - if np.count_nonzero(x) == 0: - phi = train_reps - else: - x = torch.tensor(x, dtype=torch.float).to(device) - phi = net[:-1](x).detach().cpu().numpy() - return phi - - phi1 = get_reps(x1) - phi2 = get_reps(x2) - RSM = compute_cov_matrix( - phi1, - phi2, - sigma=sigma - if np.count_nonzero(x1) != 0 and np.count_nonzero(x2) != 0 - else None, - ) # .cpu().numpy() - if np.count_nonzero(x2) == 0 and weights is not None: - RSM = RSM @ weights - elif np.count_nonzero(x1) == 0 and weights is not None: - RSM = weights @ RSM - return RSM - - -def optimize_noise(kernel, X_train, Y_train): - # -log liklihood - def nll_fn(x, y): - def step(theta): - K = kernel(x, x, sigma_noise=theta[0]) - return ( - np.sum(np.log(np.diagonal(np.linalg.cholesky(K)))) - + 0.5 * y.T @ np.linalg.inv(K) @ y - + 0.5 * len(x) * np.log(2 * np.pi) - ) - - return step - - # minimize -log liklihood - res = minimize( - nll_fn(X_train, Y_train), [0.01], bounds=((1e-5, 1e1),), method="L-BFGS-B" - ) - - sigma_noise_opt = res.x - fitted_kernel = partial(kernel, sigma_noise=sigma_noise_opt) - return fitted_kernel - - -def get_nn_eigen_kernel(net, device): - v = net[-1].weight.detach().cpu().numpy().T - # v -= np.mean(v) - sigma = v @ v.T - eig_vals, eig_vecs = np.linalg.eigh(sigma) - # eig_vals (n) with possibly complex entries - # eig_vecs (n x n) where [:,j] corresponds to eig_vals[j] - # sort: - eig_sorting = np.argsort(-eig_vals) - eig_vals = eig_vals[eig_sorting[:1]] - eig_vecs = eig_vecs[:, eig_sorting[:1]] - weights = 
np.diag(eig_vals) - kernel = partial( - nn_kernel, - net=net, - device=device, - train_reps=eig_vecs.T, - weights=weights, - sigma=sigma, - ) - base_point_preds = eig_vecs.T @ v - # base_points = inverse_computation(net, torch.tensor(eig_vecs.T, device=device)) - return kernel, base_point_preds, None - - -def inverse_computation(net, out_vecs): - print(net) - print(net[1:-1]) - print(net[:1]) - first_layer_out = net[1:-1](out_vecs, inverse=True).detach() - print("first_layer", first_layer_out) - x = Variable( - 100 * torch.randn(first_layer_out.shape[0], 1).cuda(), requires_grad=True - ) - params = net.parameters() - optim = torch.optim.Adam([x], 0.001) - for param in params: - param.requires_grad = False - if hasattr( - tqdm, "_instances" - ): # To have tqdm output without line-breaks between steps - tqdm._instances.clear() - net.train() - t = tqdm(range(100)) - for batch in t: - y = net[:1](x) - loss = torch.mean((first_layer_out - y) ** 2) - optim.zero_grad() - loss.backward() - optim.step() - t.set_postfix( - loss=loss.item(), - eig_vec_0=first_layer_out[0][:4].cpu().numpy(), - phi_0=y[0][:4].detach().cpu().numpy(), - ) - net.eval() - return x.detach().cpu().numpy() - - -def optimize_base_points(net): - v = net[-1].weight.detach().T - sigma = v @ v.T - eig_vals, eig_vecs = torch.eig(sigma, eigenvectors=True) - # eig_vals (n x 2) with entries (real,imaginary) - # eig_vecs (n x n) where [:,j] corresponds to eig_vals[j] - eig_vecs = eig_vecs.T - x = Variable(15 * torch.randn(eig_vecs.shape[0], 1).cuda(), requires_grad=True) - params = net.parameters() - optim = torch.optim.Adam([x], 0.001) - for param in params: - param.requires_grad = False - if hasattr( - tqdm, "_instances" - ): # To have tqdm output without line-breaks between steps - tqdm._instances.clear() - t = tqdm(range(100)) - net.train() - for batch in t: - y = net[:-1](x) - loss = torch.mean((eig_vecs - y) ** 2) - optim.zero_grad() - loss.backward() - optim.step() - net.eval() - phi = net[:-1](x) - t.set_postfix( - loss=loss.item(), - eig_vec_0=eig_vecs[0][:4].cpu().numpy(), - phi_0=phi[0][:4].detach().cpu().numpy(), - ) - net.train() - net.eval() - return x.detach().cpu().numpy(), eig_vals.cpu().numpy() diff --git a/bias_transfer/gp/utils.py b/bias_transfer/gp/utils.py deleted file mode 100644 index 6a25e88..0000000 --- a/bias_transfer/gp/utils.py +++ /dev/null @@ -1,35 +0,0 @@ -import numpy as np -import matplotlib.pyplot as plt - -def plot_gp(mu, cov, X, samples=[], Y=None, X_train=None, Y_train=None, save=""): - if Y is not None: - plt.plot(X, Y, color='orange', lw=2, label='True') - if X_train is not None and Y_train is not None: - plt.plot(X_train, Y_train, color='red', label="Traning data") - X = X.reshape(-1) - mu = mu.reshape(-1) - - # cov *= 100000 - # gp_samples = np.random.multivariate_normal(mu, cov, size=1000) - # uncertainty = 2 * np.std(gp_samples, axis=0) - # 95% confidence interval - uncertainty = 1.96 * np.sqrt(np.abs(np.diag(cov))) - - plt.fill_between(X, mu + uncertainty, mu - uncertainty, alpha=0.4) - plt.plot(X, mu, label='Mean') - - for i, sample in enumerate(samples): - plt.plot(X, sample, lw=1, ls='--', label='sample_{}'.format(i)) - - plt.legend() - if save: - fig = plt.gcf() - fig.savefig(save, dpi=200) - -def plot_kernel(kernel, x): - K_plot = kernel(x,x) - plt.imshow(K_plot) - # if np.count_nonzero(x) > 0: - # _ = plt.xticks(np.arange(0,x.shape[0], 15),x[::15,0].astype(np.int)) - # _ = plt.yticks(np.arange(0,x.shape[0], 15),x[::15,0].astype(np.int)) - plt.colorbar() diff --git 
a/bias_transfer/models/__init__.py b/bias_transfer/models/__init__.py deleted file mode 100644 index afe8b24..0000000 --- a/bias_transfer/models/__init__.py +++ /dev/null @@ -1,150 +0,0 @@ -import torch -import numpy as np - -from bias_transfer.configs.model import ( - Classification, - MTL, - Regression, -) -from bias_transfer.models.resnet import resnet_builder -from bias_transfer.models.wrappers.noise_adv import NoiseAdvWrapper -from bias_transfer.models.utils import get_model_parameters -from bias_transfer.models.vgg import vgg_builder -from torch.hub import load_state_dict_from_url - -from nnfabrik.utility.nn_helpers import load_state_dict -from nnvision.models.models import se_core_gauss_readout, se_core_point_readout -from .lenet import lenet_builder -from .lenet_bayesian import lenet_builder as bayes_builder -from .lenet_frcl import lenet_builder as frcl_builder -from .mlp import MLP -from .wrappers import * - - -def neural_cnn_builder(data_loaders, seed: int = 1000, **config): - config.pop("comment", None) - readout_type = config.pop("readout_type", None) - if readout_type == "point": - model = se_core_point_readout(dataloaders=data_loaders, seed=seed, **config) - elif readout_type == "gauss": - model = se_core_gauss_readout(dataloaders=data_loaders, seed=seed, **config) - print("Model with {} parameters.".format(get_model_parameters(model))) - return model - - -def mtl_builder(data_loaders, seed: int = 1000, **config): - config = MTL.from_dict(config) - torch.manual_seed(seed) - np.random.seed(seed) - - from .mtl_vgg import MTL_VGG - - model = MTL_VGG( - data_loaders, - vgg_type=config.vgg_type, - classification=config.classification, - classification_readout_type=config.classification_readout_type, - input_size=config.input_size, - num_classes=config.num_classes, - pretrained=config.pretrained, - v1_model_layer=config.v1_model_layer, - neural_input_channels=config.neural_input_channels, - classification_input_channels=config.classification_input_channels, - v1_fine_tune=config.v1_fine_tune, - v1_init_mu_range=config.v1_init_mu_range, - v1_init_sigma_range=config.v1_init_sigma_range, - v1_readout_bias=config.v1_readout_bias, - v1_bias=config.v1_bias, - v1_gamma_readout=config.v1_gamma_readout, - v1_elu_offset=config.v1_elu_offset, - v1_final_batchnorm=config.v1_final_batchnorm, - ) - - print("Model with {} parameters.".format(get_model_parameters(model))) - return model - - -def classification_model_builder(data_loader, seed: int, **config): - config = Classification.from_dict(config) - torch.manual_seed(seed) - np.random.seed(seed) - if "vgg" in config.type: - model = vgg_builder(seed, config) - from torchvision.models.vgg import model_urls - elif "resnet" in config.type: - model = resnet_builder(seed, config) - from torchvision.models.resnet import model_urls - elif "lenet" in config.type: - if "bayes" in config.type: - model = bayes_builder(seed, config) - elif "frcl" in config.type: - model = frcl_builder(seed, config) - else: - model = lenet_builder(seed, config) - else: - raise Exception("Unknown type {}".format(config.type)) - - if config.pretrained: - print("Downloading pretrained model:", flush=True) - url = ( - model_urls[config.type] - if not config.pretrained_url - else config.pretrained_url - ) - state_dict = load_state_dict_from_url(url, progress=True) - try: - load_state_dict(model, state_dict) - except: - load_state_dict(model, state_dict["model_state_dict"]) - - # Add wrappers - if config.get_intermediate_rep: - model = IntermediateLayerGetter( - model, 
return_layers=config.get_intermediate_rep, keep_output=True - ) - if config.noise_adv_regression or config.noise_adv_classification: - assert not config.self_attention - model = NoiseAdvWrapper( - model, - input_size=model.fc.in_features - if "resnet" in config.type - else model.n_features, - hidden_size=model.fc.in_features if "resnet" in config.type else 4096, - classification=config.noise_adv_classification, - num_noise_readout_layers=config.num_noise_readout_layers, - sigmoid_output=config.noise_sigmoid_output, - ) - print("Model with {} parameters.".format(get_model_parameters(model))) - if config.add_buffer: - for n, p in model.named_parameters(): - if p.requires_grad: - n = n.replace(".", "__") - for b in config.add_buffer: - model.register_buffer( - f"{n}_{b}", p.detach().clone().zero_(), - ) - return model - - -def regression_model_builder(data_loader, seed: int, **config): - config = Regression.from_dict(config) - torch.manual_seed(seed) - np.random.seed(seed) - - model = MLP( - input_size=config.input_size, - num_layers=config.num_layers, - layer_size=config.layer_size, - output_size=config.output_size, - activation=config.activation, - dropout=config.dropout, - ) - - # Add wrappers - if config.get_intermediate_rep: - model = IntermediateLayerGetter( - model, return_layers=config.get_intermediate_rep, keep_output=True - ) - - print("Model with {} parameters.".format(get_model_parameters(model))) - return model diff --git a/bias_transfer/models/attention.py b/bias_transfer/models/attention.py deleted file mode 100644 index 1b6ffb7..0000000 --- a/bias_transfer/models/attention.py +++ /dev/null @@ -1,216 +0,0 @@ -""" -Implementation copied from https://github.com/leaderj1001/Stand-Alone-Self-Attention -""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.nn.init as init - -import math - - -class AttentionConv(nn.Module): - def __init__( - self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - groups=1, - bias=False, - ): - super(AttentionConv, self).__init__() - self.out_channels = out_channels - self.kernel_size = kernel_size - self.stride = stride - self.padding = padding - self.groups = groups - - assert ( - self.out_channels % self.groups == 0 - ), "out_channels should be divided by groups. 
(example: out_channels: 40, groups: 4)" - - self.rel_h = nn.Parameter( - torch.randn(out_channels // 2, 1, 1, kernel_size, 1), requires_grad=True - ) - self.rel_w = nn.Parameter( - torch.randn(out_channels // 2, 1, 1, 1, kernel_size), requires_grad=True - ) - - self.key_conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=bias) - self.query_conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=bias) - self.value_conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=bias) - - self.reset_parameters() - - def forward(self, x): - batch, channels, height, width = x.size() - - padded_x = F.pad(x, [self.padding, self.padding, self.padding, self.padding]) - q_out = self.query_conv(x) - k_out = self.key_conv(padded_x) - v_out = self.value_conv(padded_x) - - k_out = k_out.unfold(2, self.kernel_size, self.stride).unfold( - 3, self.kernel_size, self.stride - ) - v_out = v_out.unfold(2, self.kernel_size, self.stride).unfold( - 3, self.kernel_size, self.stride - ) - - k_out_h, k_out_w = k_out.split(self.out_channels // 2, dim=1) - k_out = torch.cat((k_out_h + self.rel_h, k_out_w + self.rel_w), dim=1) - - k_out = k_out.contiguous().view( - batch, self.groups, self.out_channels // self.groups, height, width, -1 - ) - v_out = v_out.contiguous().view( - batch, self.groups, self.out_channels // self.groups, height, width, -1 - ) - - q_out = q_out.view( - batch, self.groups, self.out_channels // self.groups, height, width, 1 - ) - - out = q_out * k_out - out = F.softmax(out, dim=-1) - out = torch.einsum("bnchwk,bnchwk -> bnchw", out, v_out).view( - batch, -1, height, width - ) - - return out - - def reset_parameters(self): - init.kaiming_normal_(self.key_conv.weight, mode="fan_out", nonlinearity="relu") - init.kaiming_normal_( - self.value_conv.weight, mode="fan_out", nonlinearity="relu" - ) - init.kaiming_normal_( - self.query_conv.weight, mode="fan_out", nonlinearity="relu" - ) - - init.normal_(self.rel_h, 0, 1) - init.normal_(self.rel_w, 0, 1) - - -class AttentionStem(nn.Module): - def __init__( - self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - groups=1, - m=4, - bias=False, - ): - super(AttentionStem, self).__init__() - self.out_channels = out_channels - self.kernel_size = kernel_size - self.stride = stride - self.padding = padding - self.groups = groups - self.m = m - - assert ( - self.out_channels % self.groups == 0 - ), "out_channels should be divided by groups. 
(example: out_channels: 40, groups: 4)" - - self.emb_a = nn.Parameter( - torch.randn(out_channels // groups, kernel_size), requires_grad=True - ) - self.emb_b = nn.Parameter( - torch.randn(out_channels // groups, kernel_size), requires_grad=True - ) - self.emb_mix = nn.Parameter( - torch.randn(m, out_channels // groups), requires_grad=True - ) - - self.key_conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=bias) - self.query_conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=bias) - self.value_conv = nn.ModuleList( - [ - nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=bias) - for _ in range(m) - ] - ) - - self.reset_parameters() - - def forward(self, x): - batch, channels, height, width = x.size() - - padded_x = F.pad(x, [self.padding, self.padding, self.padding, self.padding]) - - q_out = self.query_conv(x) - k_out = self.key_conv(padded_x) - v_out = torch.stack( - [self.value_conv[_](padded_x) for _ in range(self.m)], dim=0 - ) - - k_out = k_out.unfold(2, self.kernel_size, self.stride).unfold( - 3, self.kernel_size, self.stride - ) - v_out = v_out.unfold(3, self.kernel_size, self.stride).unfold( - 4, self.kernel_size, self.stride - ) - - k_out = k_out[:, :, :height, :width, :, :] - v_out = v_out[:, :, :, :height, :width, :, :] - - emb_logit_a = torch.einsum("mc,ca->ma", self.emb_mix, self.emb_a) - emb_logit_b = torch.einsum("mc,cb->mb", self.emb_mix, self.emb_b) - emb = emb_logit_a.unsqueeze(2) + emb_logit_b.unsqueeze(1) - emb = F.softmax(emb.view(self.m, -1), dim=0).view( - self.m, 1, 1, 1, 1, self.kernel_size, self.kernel_size - ) - - v_out = emb * v_out - - k_out = k_out.contiguous().view( - batch, self.groups, self.out_channels // self.groups, height, width, -1 - ) - v_out = v_out.contiguous().view( - self.m, - batch, - self.groups, - self.out_channels // self.groups, - height, - width, - -1, - ) - v_out = torch.sum(v_out, dim=0).view( - batch, self.groups, self.out_channels // self.groups, height, width, -1 - ) - - q_out = q_out.view( - batch, self.groups, self.out_channels // self.groups, height, width, 1 - ) - - out = q_out * k_out - out = F.softmax(out, dim=-1) - out = torch.einsum("bnchwk,bnchwk->bnchw", out, v_out).view( - batch, -1, height, width - ) - - return out - - def reset_parameters(self): - init.kaiming_normal_(self.key_conv.weight, mode="fan_out", nonlinearity="relu") - init.kaiming_normal_( - self.query_conv.weight, mode="fan_out", nonlinearity="relu" - ) - for _ in self.value_conv: - init.kaiming_normal_(_.weight, mode="fan_out", nonlinearity="relu") - - init.normal_(self.emb_a, 0, 1) - init.normal_(self.emb_b, 0, 1) - init.normal_(self.emb_mix, 0, 1) - - -# temp = torch.randn((2, 3, 32, 32)) -# conv = AttentionConv(3, 16, kernel_size=3, padding=1) -# print(conv(temp).size()) diff --git a/bias_transfer/models/lenet_bayesian.py b/bias_transfer/models/lenet_bayesian.py deleted file mode 100644 index a40c729..0000000 --- a/bias_transfer/models/lenet_bayesian.py +++ /dev/null @@ -1,193 +0,0 @@ -import math -from typing import OrderedDict, Union, Dict - -import torch -import torch.nn as nn -import torch.nn.functional as F -import numpy as np - -from bias_transfer.models.utils import concatenate_flattened - - -class BayesLinear(nn.Module): - def __init__( - self, - in_features: int, - out_features: int, - initial_posterior_var: float = 1e-3, - bias: bool = True, - ): - super(BayesLinear, self).__init__() - self.in_features = in_features - self.out_features = out_features - self.initial_posterior_var = initial_posterior_var - ( - 
self.w_prior_mean, - self.w_prior_log_var, - self.w_posterior_mean, - self.w_posterior_log_var, - ) = self.create_parameter("weight", (out_features, in_features)) - if bias: - ( - self.b_prior_mean, - self.b_prior_log_var, - self.b_posterior_mean, - self.b_posterior_log_var, - ) = self.create_parameter("bias", (out_features,)) - else: - self.register_parameter("b_posterior_mean", None) - self.register_parameter("b_posterior_log_var", None) - self.reset_parameters() - - def create_parameter(self, name, dims): - prior_mean = torch.zeros(*dims) - prior_log_var = torch.zeros(*dims) - posterior_mean = nn.Parameter(torch.Tensor(*dims), requires_grad=True) - posterior_log_var = nn.Parameter(torch.Tensor(*dims), requires_grad=True) - # Finally, we register the prior and the posterior with the nn.Module. - # The prior values are registered as buffers, which indicates to PyTorch - # that they represent persistent state which should not be updated by - # the optimizer. The posteriors are registered as parameters, which on - # the other hand are to be modified by the optimizer. - self.register_buffer(f"{name}", prior_mean) # to load with the right name - self.register_buffer(f"prior_{name}_log_var", prior_log_var) - - return prior_mean, prior_log_var, posterior_mean, posterior_log_var - - def reset_for_new_task(self): - """ - Called after completion of a task, to reset state for the next task - """ - # Set the value of the prior to be the current value of the posterior - self.w_prior_mean.data.copy_(self.w_posterior_mean.data) - self.b_prior_mean.data.copy_(self.b_posterior_mean.data) - self.w_prior_log_var.data.copy_(self.w_posterior_log_var.data) - self.b_prior_log_var.data.copy_(self.b_posterior_log_var.data) - - def reset_parameters(self): - # Initialise the posterior means with a normal distribution. Note that - # prior to training we will run a procedure to optimise these values to - # point-estimates of the parameters for the first task. - torch.nn.init.normal_(self.w_posterior_mean, mean=0, std=0.1) - # Initialise the posterior variances with the given constant value. - torch.nn.init.constant_( - self.w_posterior_log_var, math.log(self.initial_posterior_var) - ) - if self.bias is not None: - torch.nn.init.normal_(self.b_posterior_mean, mean=0, std=0.1) - # Initialise the posterior variances with the given constant value. 
- torch.nn.init.constant_( - self.b_posterior_log_var, math.log(self.initial_posterior_var) - ) - - @staticmethod - def _sample_parameters(w_mean, b_mean, w_log_var, b_log_var): - # sample weights and biases from normal distributions - w_epsilon = torch.randn_like(w_mean) - b_epsilon = torch.randn_like(b_mean) - sampled_weight = w_mean + w_epsilon * torch.exp(0.5 * w_log_var) - sampled_bias = b_mean + b_epsilon * torch.exp(0.5 * b_log_var) - return sampled_weight, sampled_bias - - def forward(self, input): - sampled_weight, sampled_bias = self._sample_parameters( - self.w_posterior_mean, - self.b_posterior_mean, - self.w_posterior_log_var, - self.b_posterior_log_var, - ) - return F.linear(input, sampled_weight, sampled_bias) - - -class LeNet300100(nn.Module): - def __init__( - self, - num_classes: int = 10, - input_size: int = 28, - input_channels: int = 1, - dropout: float = 0.0, - ): - super(LeNet300100, self).__init__() - self.fc1 = BayesLinear(input_size * input_size * input_channels, 300) - self.fc2 = BayesLinear(300, 100) - self.fc3 = BayesLinear(100, num_classes) - self.dropout = nn.Dropout(p=dropout) if dropout else None - self.flat_input_size = input_size * input_size * input_channels - - def forward(self, x, num_samples=1): - x = x.view(x.size(0), self.flat_input_size) - y = [] - for s in range(num_samples): - z = F.relu(self.fc1(x)) - z = self.dropout(z) if self.dropout else z - z = F.relu(self.fc2(z)) - z = self.dropout(z) if self.dropout else z - y.append(self.fc3(z)) - return torch.cat(y) - - def get_parameters(self, name): - if "prior" in name: - return concatenate_flattened( - [ - self.fc1.__getattribute__(f"w_{name}"), - self.fc2.__getattribute__(f"w_{name}"), - self.fc3.__getattribute__(f"w_{name}"), - self.fc1.__getattribute__(f"b_{name}"), - self.fc2.__getattribute__(f"b_{name}"), - self.fc3.__getattribute__(f"b_{name}"), - ] - ) - else: - return concatenate_flattened( - [ - self.fc1._parameters.get(f"w_{name}"), - self.fc2._parameters.get(f"w_{name}"), - self.fc3._parameters.get(f"w_{name}"), - self.fc1._parameters.get(f"b_{name}"), - self.fc2._parameters.get(f"b_{name}"), - self.fc3._parameters.get(f"b_{name}"), - ] - ) - - def to(self, *args, **kwargs): - """ - Our prior tensors are registered as buffers but the way we access them - indirectly (through tuple attributes on the model) is causing problems - because when we use `.to()` to move the model to a new device, the prior - tensors get moved (because they're registered as buffers) but the - references in the tuples don't get updated to point to the new moved - tensors. This has no effect when running just on a cpu but breaks the - model when trying to run on a gpu. There are a million nicer ways of - working around this problem, but for now the easiest thing is to do - this: override the `.to()` method and manually update our references to - prior tensors. 
- """ - self = super().to(*args, **kwargs) - for fc in [self.fc1, self.fc2, self.fc3]: - fc.w_prior_mean = fc.w_prior_mean.to(*args, **kwargs) - fc.w_prior_log_var = fc.w_prior_log_var.to(*args, **kwargs) - fc.b_prior_mean = fc.b_prior_mean.to(*args, **kwargs) - fc.b_prior_log_var = fc.b_prior_log_var.to(*args, **kwargs) - return self - - def reset_for_new_task(self): - for fc in [self.fc1, self.fc2, self.fc3]: - fc.reset_for_new_task() - - -def lenet_builder(seed: int, config): - if "5" in config.type: - lenet = LeNet5 - elif "300-100" in config.type: - lenet = LeNet300100 - - torch.manual_seed(seed) - np.random.seed(seed) - torch.cuda.manual_seed(seed) - model = lenet( - num_classes=config.num_classes, - input_size=config.input_size, - input_channels=config.input_channels, - dropout=config.dropout, - ) - return model diff --git a/bias_transfer/models/lenet_frcl.py b/bias_transfer/models/lenet_frcl.py deleted file mode 100644 index 0eee0f1..0000000 --- a/bias_transfer/models/lenet_frcl.py +++ /dev/null @@ -1,295 +0,0 @@ -import math - -import torch -import torch.nn as nn -import torch.nn.functional as F -import numpy as np -from torch.distributions.multivariate_normal import MultivariateNormal - - -class FRCL(nn.Module): - def __init__( - self, - input_size, - input_channels, - h_dim, - coreset_size, - num_classes: int = 10, - dropout: float = 0.0, - sigma_prior=1, - init_mu_std=1.0, - ): - """ - Adapted from the implementation of https://github.com/AndreevP/FRCL - Args: - input_size: - input_channels: - h_dim: - coreset_size: - num_classes: - dropout: - sigma_prior: - init_mu_std: - """ - super(FRCL, self).__init__() - self.num_classes = num_classes - self.dropout = nn.Dropout(p=dropout) if dropout else None - - self.sigma_prior = sigma_prior - self.w_prior = MultivariateNormal( - torch.zeros(h_dim), covariance_matrix=sigma_prior * torch.eye(h_dim), - ) - self.pred_func = nn.Softmax(dim=-1) - self.init_mu_std = init_mu_std - - self.L = nn.ParameterList( - [ - nn.Parameter(torch.zeros(h_dim, h_dim), requires_grad=True) - for _ in range(num_classes) - ] - ) - self.mu = nn.ParameterList( - [ - nn.Parameter(torch.zeros(h_dim), requires_grad=True,) - for _ in range(num_classes) - ] - ) - self.mu_prev, self.cov_prev = [], [] - for i in range(num_classes): - self.register_buffer(f"mu_prev_{i}", torch.zeros(coreset_size)) - self.register_buffer( - f"cov_prev_{i}", torch.zeros(coreset_size, coreset_size) - ) - self.register_buffer( - "coreset", torch.zeros(coreset_size, input_channels, input_size, input_size) - ) - self.register_buffer( - "coreset_prev", - torch.zeros(coreset_size, input_channels, input_size, input_size), - ) - self.device = self.coreset.device - self.reset_parameters() - - def reset_parameters(self): - for i in range(self.num_classes): - torch.nn.init.eye_(self.L[i]) - torch.nn.init.normal_(self.mu[i], mean=0, std=self.init_mu_std) - # TODO reset the rest? 
- - def to(self, *args, **kwargs): - self = super().to(*args, **kwargs) - for i in range(self.num_classes): - self._buffers[f"mu_prev_{i}"] = self._buffers[f"mu_prev_{i}"].to( - *args, **kwargs - ) - self._buffers[f"cov_prev_{i}"] = self._buffers[f"cov_prev_{i}"].to( - *args, **kwargs - ) - self.w_prior = MultivariateNormal( - self.w_prior.mean.to(*args, **kwargs), - covariance_matrix=self.w_prior.covariance_matrix.to(*args, **kwargs), - ) - self.coreset = self.coreset.to(*args, **kwargs) - self.coreset_prev = self.coreset_prev.to(*args, **kwargs) - self.device = self.coreset.device - return self - - def reset_for_new_task(self): - """ - Called after completion of a task, to reset state for the next task - """ - self.coreset_prev = self.coreset - phi_z = self.core_forward(self.coreset_prev) - for i in range(self.num_classes): - mu, cov = self._get_inducing_distribution(phi_z, i) - self._buffers[f"mu_prev_{i}"] = mu - self._buffers[f"cov_prev_{i}"] = cov - - @property - def prev(self): - try: - return self._prev - except AttributeError: - self._prev = torch.any(self.coreset_prev != 0) - return self._prev - - def forward(self, x, num_samples=8): - phi = self.core_forward(x) - if self.training: - return self._train_forward(phi, num_samples) - else: - return self._eval_forward(phi, num_samples) - - def core_forward(self, x): - raise NotImplementedError() - - def _train_forward(self, phi, num_samples): - """ - Return -ELBO - N_k = len(dataset), required for unbiased estimate through minibatch - """ - mu = self.mu - cov = [self.L[i] @ self.L[i].T for i in range(len(self.L))] - means = torch.stack([phi @ mu[i] for i in range(len(mu))], dim=1) - # variances = torch.cat([((phi @ cov[i]) * phi).sum(-1) for i in range(len(cov))], axis = 0) - variances = torch.stack( - [torch.diagonal(phi @ cov[i] @ phi.T, 0) for i in range(len(cov))], dim=1 - ) - samples = torch.cat( - [ - means - + torch.sqrt(variances + 1e-6) - * torch.randn(means.shape).to(self.device) - for i in range(num_samples) - ] - ) - return samples - - def _get_inducing_distribution(self, phi_z, i): - mu_u = phi_z @ self.mu[i] - L_u = phi_z @ self.L[i] - cov_u = L_u @ L_u.T - cov_u = cov_u + torch.eye(cov_u.shape[0]).to(self.device) * 1e-4 - return mu_u, cov_u - - def _get_predictive(self, phi_x): - """ Computes predictive distribution according to section 2.5 - x - batch of data - k - index of task - Return predictive distribution q_\theta(f) - """ - phi_z = self.core_forward(self.coreset) - k_xx = phi_x @ phi_x.T * self.sigma_prior - k_xz = phi_x @ phi_z.T * self.sigma_prior - k_zz = phi_z @ phi_z.T * self.sigma_prior - k_zz_ = torch.inverse(k_zz + torch.eye(phi_z.shape[0]).to(self.device) * 1e-3) - - mu_u, cov_u = ( - [None for _ in range(self.num_classes)], - [None for _ in range(self.num_classes)], - ) - for i in range(self.num_classes): - mu_u[i], cov_u[i] = self._get_inducing_distribution(phi_z, i) - - mu = [phi_x @ phi_z.T @ k_zz_ @ mu_u[i] for i in range(self.num_classes)] - sigma = [ - k_xx - + ( - k_xz - @ k_zz_ - @ (cov_u[i] - k_zz + torch.eye(k_zz.shape[0]).to(self.device) * 1e-4) - @ k_zz_ - @ k_xz.T - ) - for i in range(self.num_classes) - ] - sigma = [ - sigma[i] * torch.eye(sigma[i].shape[0]).to(self.device) - + torch.eye(sigma[i].shape[0]).to(self.device) * 1e-6 - for i in range(self.num_classes) - ] - # print([s.min() for s in sigma]) - sigma = [ - torch.clamp(sigma[i], min=0, max=10000.0) - + torch.eye(sigma[i].shape[0]).to(self.device) * 1e-6 - for i in range(self.num_classes) - ] - # we are interested only - # in 
diagonal part for inference ? - return [ - MultivariateNormal(loc=mu[i], covariance_matrix=sigma[i]) - for i in range(self.num_classes) - ] - - def _eval_forward(self, phi, num_samples): - """ - Compute p(y) by MC estimate from q_\theta(f)? - """ - distr = self._get_predictive(phi) - # TODO: speedup possible if you precompute distr before an eval epoch (i.e. not recompute for each batch) - predicted = [] - for _ in range(num_samples): - sample = [distr[i].sample() for i in range(self.num_classes)] - predicted.append(self.pred_func(torch.stack(sample, dim=1))) - return torch.cat(predicted) - - -class LeNet5( - FRCL -): # adapted from https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html - def __init__(self, input_size: int = 28, input_channels: int = 1, *args, **kwargs): - super(LeNet5, self).__init__( - input_size=input_size, - input_channels=input_channels, - h_dim=84, - *args, - **kwargs, - ) - conv_out_size = int( - ((((input_size - 3) + 1) / 2 - 3) + 1) / 2 - ) # [(W-K+2P)/S]+1 / MP - self.flat_feature_size = (conv_out_size ** 2) * 16 - # 1 input image channel, 6 output channels, 3x3 square convolution - # kernel - self.conv1 = nn.Conv2d(input_channels, 6, 3) - self.conv2 = nn.Conv2d(6, 16, 3) - # an affine operation: y = Wx + b - self.fc1 = nn.Linear(self.flat_feature_size, 120) - self.fc2 = nn.Linear(120, 84) - - def core_forward(self, x): - x = F.relu(self.conv1(x)) - x = self.dropout(x) if self.dropout else x - # Max pooling over a (2, 2) window - x = F.max_pool2d(x, (2, 2)) - x = F.relu(self.conv2(x)) - x = self.dropout(x) if self.dropout else x - # If the size is a square you can only specify a single number - x = F.max_pool2d(x, 2) - x = x.view(-1, self.flat_feature_size) - x = F.relu(self.fc1(x)) - x = self.dropout(x) if self.dropout else x - x = F.relu(self.fc2(x)) - x = self.dropout(x) if self.dropout else x - return x - - -class LeNet300100(FRCL): - def __init__(self, input_size: int = 28, input_channels: int = 1, *args, **kwargs): - super(LeNet300100, self).__init__( - input_size=input_size, - input_channels=input_channels, - h_dim=100, - *args, - **kwargs, - ) - self.fc1 = nn.Linear(input_size * input_size * input_channels, 300) - self.fc2 = nn.Linear(300, 100) - self.flat_input_size = input_size * input_size * input_channels - - def core_forward(self, x): - x = x.view(x.size(0), self.flat_input_size) - x = F.relu(self.fc1(x)) - x = self.dropout(x) if self.dropout else x - x = F.relu(self.fc2(x)) - x = self.dropout(x) if self.dropout else x - return x - - -def lenet_builder(seed: int, config): - if "5" in config.type: - lenet = LeNet5 - elif "300-100" in config.type: - lenet = LeNet300100 - - torch.manual_seed(seed) - np.random.seed(seed) - torch.cuda.manual_seed(seed) - model = lenet( - num_classes=config.num_classes, - input_size=config.input_size, - input_channels=config.input_channels, - dropout=config.dropout, - coreset_size=config.coreset_size, - ) - return model diff --git a/bias_transfer/models/mtl_vgg.py b/bias_transfer/models/mtl_vgg.py deleted file mode 100644 index b9453b0..0000000 --- a/bias_transfer/models/mtl_vgg.py +++ /dev/null @@ -1,296 +0,0 @@ -import torchvision -import torch.nn as nn -from torch.autograd import Variable -from neuralpredictors.layers.cores import Core2d -import torch -from nnfabrik.utility.nn_helpers import get_dims_for_loader_dict -import numpy as np -from torch.nn import functional as F -from neuralpredictors.layers.legacy import Gaussian2d -from neuralpredictors.training import eval_state -from .vgg import 
create_vgg_readout - -VGG_TYPES = { - "vgg11": torchvision.models.vgg11, - "vgg11_bn": torchvision.models.vgg11_bn, - "vgg13": torchvision.models.vgg13, - "vgg13_bn": torchvision.models.vgg13_bn, - "vgg16": torchvision.models.vgg16, - "vgg16_bn": torchvision.models.vgg16_bn, - "vgg19_bn": torchvision.models.vgg19_bn, - "vgg19": torchvision.models.vgg19, -} - - -def get_module_output(model, input_shape): - """ - Gets the output dimensions of the convolutional core - by passing an input image through all convolutional layers - :param core: convolutional core of the DNN, which final dimensions - need to be passed on to the readout layer - :param input_shape: the dimensions of the input - :return: output dimensions of the core - """ - initial_device = "cuda" if next(iter(model.parameters())).is_cuda else "cpu" - device = "cuda" if torch.cuda.is_available() else "cpu" - with eval_state(model): - with torch.no_grad(): - input = torch.zeros(1, *input_shape[1:]).to(device) - output = model.to(device)(input) - model.to(initial_device) - - return output[0].shape - - -class MultipleGaussian2d(torch.nn.ModuleDict): - def __init__( - self, - in_shapes, - n_neurons_dict, - init_mu_range, - init_sigma_range, - bias, - gamma_readout, - ): - # super init to get the _module attribute - super(MultipleGaussian2d, self).__init__() - for k in n_neurons_dict: - in_shape = in_shapes[k] - n_neurons = n_neurons_dict[k] - self.add_module( - k, - Gaussian2d( - in_shape=in_shape, - outdims=n_neurons, - init_mu_range=init_mu_range, - init_sigma_range=init_sigma_range, - bias=bias, - ), - ) - self.gamma_readout = gamma_readout - - def forward(self, *args, data_key=None, **kwargs): - if data_key is None and len(self) == 1: - data_key = list(self.keys())[0] - return self[data_key](*args, **kwargs) - - def regularizer(self, data_key): - return self[data_key].feature_l1(average=False) * self.gamma_readout - - -class MTL_VGG_Core(Core2d, nn.Module): - def __init__( - self, - classification=True, - vgg_type="vgg19_bn", - pretrained=True, - v1_model_layer=17, - neural_input_channels=1, - classification_input_channels=1, - v1_fine_tune=False, - momentum=0.1, - v1_bias=True, - v1_final_batchnorm=False, - **kwargs - ): - - super(MTL_VGG_Core, self).__init__() - self.v1_model_layer = v1_model_layer - self.neural_input_channels, self.classification_input_channels = ( - neural_input_channels, - classification_input_channels, - ) - self.v1_final_batchnorm = v1_final_batchnorm - self.classification = classification - - # load convolutional part of vgg - assert vgg_type in VGG_TYPES, "Unknown vgg_type '{}'".format(vgg_type) - vgg_loader = VGG_TYPES[vgg_type] - vgg = vgg_loader(pretrained=pretrained) - - self.shared_block = nn.Sequential( - *list(vgg.features.children())[:v1_model_layer] - ) - - # Remove the bias of the last conv layer if not bias: - if not v1_bias: - if "bias" in self.shared_block[-1]._parameters: - zeros = torch.zeros_like(self.shared_block[-1].bias) - self.shared_block[-1].bias.data = zeros - - # Fix pretrained parameters during training parameters - if not v1_fine_tune: - for param in self.shared_block.parameters(): - param.requires_grad = False - - if v1_final_batchnorm: - self.v1_extra = nn.Sequential() - self.v1_extra.add_module( - "OutBatchNorm", nn.BatchNorm2d(self.outchannels, momentum=momentum) - ) - self.v1_extra.add_module("OutNonlin", nn.ReLU(inplace=True)) - - if classification: - self.unshared_block = nn.Sequential( - *list(vgg.features.children())[v1_model_layer:] - ) - - def forward(self, x, 
classification=False): - if (classification and self.classification_input_channels == 1) or ( - not classification and self.neural_input_channels == 1 - ): - x = x.expand(-1, 3, -1, -1) - v1_core_out = shared_core_out = self.shared_block(x) - if self.v1_final_batchnorm: - v1_core_out = self.v1_extra(shared_core_out) - if classification: - core_out = self.unshared_block(shared_core_out) - return v1_core_out, core_out - return v1_core_out, None - - @property - def outchannels(self): - """ - Returns: dimensions of the output, after a forward pass through the model - """ - found_out_channels = False - i = 1 - while not found_out_channels: - if "out_channels" in self.shared_block[-i].__dict__: - found_out_channels = True - else: - i = i + 1 - return self.shared_block[-i].out_channels - - -class MTL_VGG(nn.Module): - def __init__( - self, - dataloaders, - vgg_type="vgg19_bn", - classification=False, - classification_readout_type=None, - input_size=None, - num_classes=200, - pretrained=True, - v1_model_layer=17, - neural_input_channels=1, - classification_input_channels=1, - v1_fine_tune=False, - v1_init_mu_range=0.4, - v1_init_sigma_range=0.6, - v1_readout_bias=True, - v1_bias=True, - v1_final_batchnorm=False, - v1_gamma_readout=0.002, - v1_elu_offset=-1, - **kwargs - ): - - super(MTL_VGG, self).__init__() - self.classification_readout_type = classification_readout_type - self.input_size = input_size - self.num_classes = num_classes - self.v1_elu_offset = v1_elu_offset - self.neural_input_channels = neural_input_channels - self.classification_input_channels = classification_input_channels - - # for neural dataloaders - if classification: - neural_train_dataloaders = { - k: loader - for k, loader in dataloaders["train"].items() - if k != "img_classification" - } - elif "train" in dataloaders.keys(): - neural_train_dataloaders = dataloaders["train"] - else: - neural_train_dataloaders = dataloaders - - session_shape_dict = get_dims_for_loader_dict(neural_train_dataloaders) - in_name, out_name = next( - iter(list(neural_train_dataloaders.values())[0]) - )._fields - self.neural_input_channels = [ - v[in_name][1] for v in session_shape_dict.values() - ] - assert ( - np.unique(self.neural_input_channels).size == 1 - ), "all input channels must be of equal size" - - self.mtl_vgg_core = MTL_VGG_Core( - vgg_type=vgg_type, - classification=classification, - pretrained=pretrained, - v1_model_layer=v1_model_layer, - v1_fine_tune=v1_fine_tune, - neural_input_channels=self.neural_input_channels[0], - classification_input_channels=self.classification_input_channels, - v1_final_batchnorm=v1_final_batchnorm, - v1_bias=v1_bias, - ) - - n_neurons_dict = {k: v[out_name][1] for k, v in session_shape_dict.items()} - in_shapes_dict = {k: v[in_name] for k, v in session_shape_dict.items()} - in_shapes = {} - for k in n_neurons_dict: - in_shapes[k] = get_module_output(self.mtl_vgg_core, in_shapes_dict[k])[1:] - - self.v1_readout = MultipleGaussian2d( - in_shapes=in_shapes, - n_neurons_dict=n_neurons_dict, - init_mu_range=v1_init_mu_range, - bias=v1_readout_bias, - init_sigma_range=v1_init_sigma_range, - gamma_readout=v1_gamma_readout, - ) - if v1_readout_bias: - for key, value in neural_train_dataloaders.items(): - _, targets = next(iter(value)) - self.v1_readout[key].bias.data = targets.mean(0) - - if classification: - # init fully connected part of vgg - test_input = Variable(torch.zeros(1, 3, input_size, input_size)) - _, test_out = self.mtl_vgg_core(test_input, classification=True) - self.n_features = test_out.size(1) 
* test_out.size(2) * test_out.size(3) - self.classification_readout = create_vgg_readout( - classification_readout_type, - n_features=self.n_features, - num_classes=num_classes, - ) - self._initialize_weights_classification_readout() - - def forward(self, x, data_key=None, classification=False): - shared_core_out, core_out = self.mtl_vgg_core(x, classification) - if classification: - if self.classification_readout_type == "dense": - core_out = core_out.view(core_out.size(0), -1) - classification_out = self.classification_readout(core_out) - return classification_out - v1_out = self.v1_readout(shared_core_out, data_key=data_key) - v1_out = F.elu(v1_out + self.v1_elu_offset) + 1 - return v1_out - - def regularizer(self, data_key=None): - return self.v1_readout.regularizer(data_key=data_key) - - def freeze(self, selection=("v1",)): - if selection is True or "v1" in selection: - for param in self.mtl_vgg_core.shared_block.parameters(): - param.requires_grad = False - - - def _initialize_weights_classification_readout(self): - if self.mtl_vgg_core.classification: - for m in self.classification_readout: - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') - if m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.Linear): - nn.init.normal_(m.weight, 0, 0.01) - nn.init.constant_(m.bias, 0) diff --git a/bias_transfer/models/resnet_self_attention.py b/bias_transfer/models/resnet_self_attention.py deleted file mode 100644 index 82fae76..0000000 --- a/bias_transfer/models/resnet_self_attention.py +++ /dev/null @@ -1,123 +0,0 @@ -""" -Implementation adapted from https://github.com/leaderj1001/Stand-Alone-Self-Attention -""" - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from .attention import AttentionConv, AttentionStem - - -class Bottleneck(nn.Module): - expansion = 4 - - def __init__(self, in_channels, out_channels, stride=1, groups=1, base_width=64): - super(Bottleneck, self).__init__() - self.stride = stride - width = int(out_channels * (base_width / 64.0)) * groups - - self.conv1 = nn.Sequential( - nn.Conv2d(in_channels, width, kernel_size=1, bias=False), - nn.BatchNorm2d(width), - nn.ReLU(), - ) - self.conv2 = nn.Sequential( - AttentionConv(width, width, kernel_size=7, padding=3, groups=8), - nn.BatchNorm2d(width), - nn.ReLU(), - ) - self.conv3 = nn.Sequential( - nn.Conv2d(width, self.expansion * out_channels, kernel_size=1, bias=False), - nn.BatchNorm2d(self.expansion * out_channels), - ) - - self.shortcut = nn.Sequential() - if stride != 1 or in_channels != self.expansion * out_channels: - self.shortcut = nn.Sequential( - nn.Conv2d( - in_channels, - self.expansion * out_channels, - kernel_size=1, - stride=stride, - bias=False, - ), - nn.BatchNorm2d(self.expansion * out_channels), - ) - - def forward(self, x): - out = self.conv1(x) - out = self.conv2(out) - out = self.conv3(out) - if self.stride >= 2: - out = F.avg_pool2d(out, (self.stride, self.stride)) - - out += self.shortcut(x) - out = F.relu(out) - - return out - - -class ResNet(nn.Module): - def __init__(self, block, num_blocks, num_classes=1000, stem=False): - super(ResNet, self).__init__() - print("RESNET SELF-ATTENTION!!!") - self.in_places = 64 - - if stem: - self.init = nn.Sequential( - # CIFAR10 - AttentionStem( - in_channels=3, - out_channels=64, - kernel_size=4, - stride=1, - padding=2, - groups=1, - ), - nn.BatchNorm2d(64), - 
nn.ReLU(), - # For ImageNet - # AttentionStem(in_channels=3, out_channels=64, kernel_size=4, stride=1, padding=2, groups=1), - # nn.BatchNorm2d(64), - # nn.ReLU(), - # nn.MaxPool2d(4, 4) - ) - else: - self.init = nn.Sequential( - # CIFAR10 - nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False), - nn.BatchNorm2d(64), - nn.ReLU(), - # For ImageNet - # nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False), - # nn.BatchNorm2d(64), - # nn.ReLU(), - # nn.MaxPool2d(kernel_size=3, stride=2, padding=1), - ) - - self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) - self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) - self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) - self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) - self.readout = nn.Linear(512 * block.expansion, num_classes) - - def _make_layer(self, block, planes, num_blocks, stride): - strides = [stride] + [1] * (num_blocks - 1) - layers = [] - for stride in strides: - layers.append(block(self.in_places, planes, stride)) - self.in_places = planes * block.expansion - return nn.Sequential(*layers) - - def forward(self, x): - out = self.init(x) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - out = self.layer4(out) - out = F.avg_pool2d(out, 4) - core_out = out.view(out.size(0), -1) - out = self.readout(core_out) - - return {"logits": out, "conv_rep": core_out} diff --git a/bias_transfer/models/wrappers/noise_adv.py b/bias_transfer/models/wrappers/noise_adv.py deleted file mode 100644 index 577898d..0000000 --- a/bias_transfer/models/wrappers/noise_adv.py +++ /dev/null @@ -1,51 +0,0 @@ -import torch.nn as nn -from torch.autograd import Function - - -# Used the implementation from https://github.com/CuthbertCai/pytorch_DANN -class GradReverse(Function): - @staticmethod - def forward(ctx, x, lambda_p): - ctx.constant = lambda_p - return x.view_as(x) - - @staticmethod - def backward(ctx, grad_output): - grad_output = grad_output.neg() * ctx.constant - return grad_output, None - - -def grad_reverse(x, lambda_p): - return GradReverse.apply(x, lambda_p) - - -class NoiseAdvWrapper(nn.Module): - def __init__( - self, - model, - input_size, - hidden_size, - classification: bool = False, - num_noise_readout_layers: int = 1, - sigmoid_output: bool = False, - ): - super().__init__() - self.model = model - - noise_readout_layers = [] - for i in range(0, num_noise_readout_layers): - in_size = input_size if i == 0 else hidden_size - out_size = 1 if i == num_noise_readout_layers - 1 else hidden_size - noise_readout_layers.append(nn.Linear(in_size, out_size)) - if i < num_noise_readout_layers - 1: - noise_readout_layers.append(nn.ReLU()) - self.noise_readout = nn.Sequential(*noise_readout_layers) - self.nonlinearity = nn.Sigmoid() if classification or sigmoid_output else nn.ReLU() - - def forward(self, x, seed: int = None, noise_lambda=None): - extra_output, out = self.model(x) - core_out = extra_output["core"] - noise_out = self.noise_readout(grad_reverse(core_out, noise_lambda)) - noise_out = self.nonlinearity(noise_out) - extra_output["noise_pred"] = noise_out - return extra_output, out diff --git a/bias_transfer/run_tests.sh b/bias_transfer/run_tests.sh deleted file mode 100755 index 63afdfe..0000000 --- a/bias_transfer/run_tests.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash - -pip install -e ../../ml-utils -pip install -e ../../nnfabrik -pip install -e ../../nnvision -pip install -e ../../bias_transfer - -python -m unittest tests diff 
--git a/bias_transfer/tables/evaluated_model.py b/bias_transfer/tables/evaluated_model.py deleted file mode 100644 index 4dd1a9f..0000000 --- a/bias_transfer/tables/evaluated_model.py +++ /dev/null @@ -1,95 +0,0 @@ -from .nnfabrik import * -from nnfabrik.templates.trained_model import * -from .trained_model import TrainedModel - - -@schema -class EvaluatedModel(TrainedModelBase): - table_comment = "Custom evaluation for trained models" - - definition = """ - -> TrainedModel - --- - ->[nullable] Fabrikant - score: float # loss - output: longblob # trainer object's output - evaluatedmodel_ts=CURRENT_TIMESTAMP: timestamp # UTZ timestamp at time of insertion - """ - ModelStorage = None - - def get_full_config(self, key=None, include_state_dict=True, include_trainer=True): - """ - Returns the full configuration dictionary needed to build all components of the network - training including dataset, model and trainer. The returned dictionary is designed to be - passed (with dictionary expansion) into the get_all_parts function provided in builder.py. - - Args: - key - specific key against which to retrieve all configuration. The key must restrict all component - tables into a single entry. If None, will assume that this table is already restricted and - will obtain an existing single entry. - include_state_dict (bool) : If True, and if key refers to a model already trained with a corresponding entry in self.ModelStorage, - the state_dict of the trained model is retrieved and returned - include_trainer (bool): If False, then trainer configuration is skipped. Usually desirable when you want to simply retrieve trained model. - """ - if key is None: - key = self.fetch1("KEY") - - model_fn, model_config = (self.model_table & key).fn_config - dataset_fn, dataset_config = (self.dataset_table & key).fn_config - - ret = dict( - model_fn=model_fn, - model_config=model_config, - dataset_fn=dataset_fn, - dataset_config=dataset_config, - ) - - if include_trainer: - trainer_fn, trainer_config = (self.trainer_table & key).fn_config - ret["trainer_fn"] = trainer_fn - ret["trainer_config"] = trainer_config - - # if trained model exist and include_state_dict is True - if include_state_dict and (TrainedModel.ModelStorage & key): - with tempfile.TemporaryDirectory() as temp_dir: - state_dict_path = (TrainedModel.ModelStorage & key).fetch1( - "model_state", download_path=temp_dir - ) - ret["state_dict"] = torch.load(state_dict_path) - - return ret - - def make(self, key): - """ - Given key specifying configuration for dataloaders, model and trainer, - trains the model and saves the trained model. 
- """ - # lookup the fabrikant corresponding to the current DJ user - fabrikant_name = Fabrikant.get_current_user() - seed = (Seed & key).fetch1("seed") - - # load everything - dataloaders, model, trainer = self.load_model( - key, include_trainer=True, include_state_dict=True, seed=seed - ) - # model = ((TrainedModel() & key).ModelStorage()).fetch1("model_state") - - # define callback with pinging - def call_back(**kwargs): - self.connection.ping() - self.call_back(**kwargs) - - # model training - score, output, model_state = trainer( - model, dataloaders, seed=seed, uid=key, cb=call_back, eval_only=True - ) - - with tempfile.TemporaryDirectory() as temp_dir: - filename = make_hash(key) + ".pth.tar" - filepath = os.path.join(temp_dir, filename) - torch.save(model_state, filepath) - - key["score"] = score - key["output"] = output - key["fabrikant_name"] = fabrikant_name - self.insert1(key) diff --git a/bias_transfer/tests/__init__.py b/bias_transfer/tests/__init__.py deleted file mode 100644 index b2909c1..0000000 --- a/bias_transfer/tests/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .test_training import * -from .test_transfer import * -from .test_model import * -from .test_dataset import * -from .test_dataset_filter import * -from .test_lottery_ticket_pruning import * diff --git a/bias_transfer/tests/_base.py b/bias_transfer/tests/_base.py deleted file mode 100644 index 90e49cf..0000000 --- a/bias_transfer/tests/_base.py +++ /dev/null @@ -1,79 +0,0 @@ -import os -import unittest -import copy - -import numpy as np -import torch -from torch.utils.data import SubsetRandomSampler -from torch.utils.data.sampler import SequentialSampler - -import nnfabrik as nnf -from bias_transfer.configs import dataset, model, trainer -from bias_transfer.models.utils import weight_reset - - -class BaseTest(unittest.TestCase): - dataset_conf = dataset.ImageDatasetConfig( - comment="Minimal CIFAR10", - dataset_cls="CIFAR10", - apply_data_normalization=False, - apply_data_augmentation=False, - add_corrupted_test=False, - valid_size=0.95, - ) - model_conf = model.ClassificationModelConfig( - comment="CIFAR10 ResNet18", dataset_cls="CIFAR10", type="resnet18", - # advanced_init=True, zero_init_residual=True - ) - seed = 42 - - @classmethod - def run_training(cls, trainer_conf): - uid = "test1" - path = "./checkpoint/ckpt.{}.pth".format(nnf.utility.dj_helpers.make_hash(uid)) - if os.path.exists(path): - os.remove(path) - torch.manual_seed(cls.seed) - np.random.seed(cls.seed) - torch.cuda.manual_seed(cls.seed) - cls.model = copy.deepcopy(cls.start_model) - - trainer_fn = nnf.builder.get_trainer(trainer_conf.fn, trainer_conf.to_dict()) - - def call_back(**kwargs): - pass - - # model training - score, output, model_state = trainer_fn( - model=cls.model, - dataloaders=cls.data_loaders, - seed=cls.seed, - uid=uid, - cb=call_back, - ) - return score - - @classmethod - def get_parts(cls, dataset_conf, model_conf, seed): - os.chdir("/work/") - cls.data_loaders, cls.model = nnf.builder.get_all_parts( - dataset_fn=dataset_conf.fn, - dataset_config=dataset_conf.to_dict(), - model_fn=model_conf.fn, - model_config=model_conf.to_dict(), - seed=seed, - trainer_fn=None, - trainer_config=None, - ) - cls.data_loaders["validation"] = cls.data_loaders["train"] - cls.data_loaders["test"] = cls.data_loaders["train"] - if "c_test" in cls.data_loaders: - category_1 = list(cls.data_loaders["c_test"].keys())[0] - cls.data_loaders["c_test"] = { - category_1: {1: cls.data_loaders["c_test"][category_1][1]} - } - cls.start_model = 
copy.deepcopy(cls.model) - - @classmethod - def setUpClass(cls): # called once before all methods of the class - cls.get_parts(cls.dataset_conf, cls.model_conf, cls.seed) diff --git a/bias_transfer/tests/_main_loop_module.py b/bias_transfer/tests/_main_loop_module.py deleted file mode 100644 index 7e83c4b..0000000 --- a/bias_transfer/tests/_main_loop_module.py +++ /dev/null @@ -1,103 +0,0 @@ -import numpy as np -import torch -from torch import optim, nn -from torch.backends import cudnn as cudnn -from tqdm import tqdm - -from bias_transfer.tests._base import BaseTest -from bias_transfer.trainer.utils import move_data - - -class MainLoopModuleTest(BaseTest): - def pre_epoch_test(self, model, epoch): - pass - - def pre_forward_test(self, model, inputs, shared_memory): - pass - - def post_forward_test(self, outputs, loss, targets, module_losses, **kwargs): - pass - - def post_backward_test(self, model): - pass - - def main_loop( - self, model, data_loader, module, config, device, epoch: int = 0 - ): - optimizer = getattr(optim, config.optimizer)( - model.parameters(), **config.optimizer_options - ) - n_iterations = len(data_loader) - torch.manual_seed(self.seed) - np.random.seed(self.seed) - if device == "cuda": - cudnn.benchmark = False - cudnn.deterministic = True - torch.cuda.manual_seed(self.seed) - criterion = getattr(nn, config.loss_functions["img_classification"])() - model.train() - epoch_loss, correct, total, module_losses, collected_outputs = 0, 0, 0, {}, [] - if hasattr( - tqdm, "_instances" - ): # To have tqdm output without line-breaks between steps - tqdm._instances.clear() - with torch.enable_grad(): - with tqdm( - enumerate(data_loader), - total=n_iterations, - desc="{} Epoch {}".format("Train", epoch), - ) as t: - - module.pre_epoch(model, True, epoch) - self.pre_epoch_test(model, epoch) - - optimizer.zero_grad() - - for batch_idx, batch_data in t: - # Pre-Forward - loss = torch.zeros(1, device=device) - inputs, targets, data_key, batch_dict = move_data( - batch_data, device, False - ) - shared_memory = {} # e.g. 
to remember where which noise was applied - model, inputs = module.pre_forward(model, inputs, shared_memory, True) - self.pre_forward_test(model, inputs, shared_memory) - # Forward - outputs = model(inputs) - # Post-Forward - outputs, loss, targets = module.post_forward(outputs, loss, targets, module_losses, True, - **shared_memory) - self.post_forward_test( - outputs, loss, targets, module_losses, **shared_memory - ) - loss += criterion(outputs, targets) - epoch_loss += loss.item() - - # Book-keeping - def average_loss(loss_): - return loss_ / (batch_idx + 1) - - _, predicted = outputs.max(1) - total += targets.size(0) - correct += predicted.eq(targets).sum().item() - eval = 100.0 * correct / total - - t.set_postfix( - eval=eval, - loss=average_loss(epoch_loss), - **{k: average_loss(l) for k, l in module_losses.items()} - ) - - # Backward - loss.backward() - module.post_backward(model) - self.post_backward_test(model) - - optimizer.step() - optimizer.zero_grad() - - return ( - eval, - average_loss(epoch_loss), - {k: average_loss(l) for k, l in module_losses.items()}, - ) diff --git a/bias_transfer/tests/test_dataset.py b/bias_transfer/tests/test_dataset.py deleted file mode 100644 index d1ec804..0000000 --- a/bias_transfer/tests/test_dataset.py +++ /dev/null @@ -1,231 +0,0 @@ -import unittest -import os -from bias_transfer.configs import trainer, model, dataset -from bias_transfer.tests._base import BaseTest -import nnfabrik as nnf - - -class DatasetTest(BaseTest): - def test_cifar100(self): - print("===================================================", flush=True) - print("================TEST CIFAR100 Training=============", flush=True) - model_conf = model.ClassificationModelConfig( - comment="CIFAR100", dataset_cls="CIFAR100", type="resnet18", - ) - dataset_conf = dataset.ImageDatasetConfig( - comment="Minimal CIFAR100", - dataset_cls="CIFAR100", - apply_data_normalization=True, - apply_data_augmentation=True, - add_corrupted_test=True, - valid_size=0.95, - ) - self.get_parts(dataset_conf, model_conf, self.seed) - trainer_conf = trainer.TrainerConfig( - comment="CIFAR100 Training Test", - max_iter=2, - verbose=False, - add_noise=False, - noise_snr=None, - noise_std=None, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - early_stop=False, - lr_milestones=(1,), - adaptive_lr=False, - patience=1000, - ) - score = self.run_training(trainer_conf) - self.assertAlmostEqual(score, 9.92, places=1) - - def test_tiny_imagenet(self): - print("===================================================", flush=True) - print("==============TEST Tiny-ImageNet Training==========", flush=True) - model_conf = model.ClassificationModelConfig( - comment="TinyImageNet", dataset_cls="TinyImageNet", type="resnet18", - ) - dataset_conf = dataset.ImageDatasetConfig( - comment="Minimal TinyImageNet", - dataset_cls="TinyImageNet", - apply_data_normalization=True, - apply_data_augmentation=True, - add_corrupted_test=True, - valid_size=0.95, - ) - self.get_parts(dataset_conf, model_conf, self.seed) - trainer_conf = trainer.TrainerConfig( - comment="TinyImageNet Training Test", - max_iter=2, - verbose=False, - add_noise=False, - noise_snr=None, - noise_std=None, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - early_stop=False, - lr_milestones=(1,), - adaptive_lr=False, - patience=1000, - ) - score = self.run_training(trainer_conf) - self.assertAlmostEqual(score, 5.52, places=1) - - def test_imagenet(self): - print("===================================================", 
flush=True) - print("=================TEST ImageNet Training============", flush=True) - model_conf = model.ClassificationModelConfig( - comment="ImageNet", dataset_cls="ImageNet", type="resnet18", - ) - dataset_conf = dataset.ImageDatasetConfig( - comment="Minimal ImageNet", - dataset_cls="ImageNet", - apply_data_normalization=True, - apply_data_augmentation=True, - add_corrupted_test=True, - batch_size=70, - valid_size=0.995, - ) - self.get_parts(dataset_conf, model_conf, self.seed) - trainer_conf = trainer.TrainerConfig( - comment="ImageNet Training Test", - max_iter=1, - verbose=False, - add_noise=False, - noise_snr=None, - noise_std=None, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - lr_milestones=(1,), - adaptive_lr=False, - early_stop=False, - patience=1000, - ) - score = self.run_training(trainer_conf) - self.assertAlmostEqual(score, 0.9990633780830471, places=1) - - def test_imagenet_pretrained(self): - print("===================================================", flush=True) - print("================TEST ImageNet Pretrained===========", flush=True) - model_conf = model.ClassificationModelConfig( - comment="ImageNet", dataset_cls="ImageNet", type="resnet50", pretrained=True - ) - dataset_conf = dataset.ImageDatasetConfig( - comment="Minimal ImageNet", - dataset_cls="ImageNet", - apply_data_normalization=True, - apply_data_augmentation=True, - add_corrupted_test=False, - valid_size=0.01, - ) - self.data_loaders, self.model = nnf.builder.get_all_parts( - dataset_fn=dataset_conf.fn, - dataset_config=dataset_conf.to_dict(), - model_fn=model_conf.fn, - model_config=model_conf.to_dict(), - seed=self.seed, - trainer_fn=None, - trainer_config=None, - ) - if "c_test" in self.data_loaders: - category_1 = list(self.data_loaders["c_test"].keys)[0] - self.data_loaders["c_test"] = { - category_1: {1: self.data_loaders["c_test"][category_1][1]} - } - trainer_conf = trainer.TrainerConfig( - comment="ImageNet Training Test", - max_iter=0, - verbose=False, - add_noise=False, - noise_snr=None, - noise_std=None, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - lr_milestones=(1,), - adaptive_lr=False, - early_stop=False, - patience=1000, - ) - uid = "test1" - path = "./checkpoint/ckpt.{}.pth".format(nnf.utility.dj_helpers.make_hash(uid)) - if os.path.exists(path): - os.remove(path) - - trainer_fn = nnf.builder.get_trainer(trainer_conf.fn, trainer_conf.to_dict()) - - def call_back(**kwargs): - pass - - # model training - score, output, model_state = trainer_fn( - model=self.model, - dataloaders=self.data_loaders, - seed=self.seed, - uid=uid, - cb=call_back, - ) - self.assertAlmostEqual(score, 76.1, places=1) - - def test_imagenet_pretrained_vgg(self): - print("===================================================", flush=True) - print("=============TEST ImageNet Pretrained (VGG)========", flush=True) - model_conf = model.ClassificationModelConfig( - comment="ImageNet", dataset_cls="ImageNet", type="vgg19_bn", pretrained=True - ) - dataset_conf = dataset.ImageDatasetConfig( - comment="Minimal ImageNet", - dataset_cls="ImageNet", - apply_data_normalization=True, - apply_data_augmentation=True, - add_corrupted_test=False, - valid_size=0.01, - ) - self.data_loaders, self.model = nnf.builder.get_all_parts( - dataset_fn=dataset_conf.fn, - dataset_config=dataset_conf.to_dict(), - model_fn=model_conf.fn, - model_config=model_conf.to_dict(), - seed=self.seed, - trainer_fn=None, - trainer_config=None, - ) - if "c_test" in self.data_loaders: - category_1 = 
list(self.data_loaders["c_test"].keys)[0] - self.data_loaders["c_test"] = { - category_1: {1: self.data_loaders["c_test"][category_1][1]} - } - trainer_conf = trainer.TrainerConfig( - comment="ImageNet Training Test", - max_iter=0, - verbose=False, - add_noise=False, - noise_snr=None, - noise_std=None, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - early_stop=False, - lr_milestones=(1,), - adaptive_lr=False, - patience=1000, - ) - uid = "test1" - path = "./checkpoint/ckpt.{}.pth".format(nnf.utility.dj_helpers.make_hash(uid)) - if os.path.exists(path): - os.remove(path) - - trainer_fn = nnf.builder.get_trainer(trainer_conf.fn, trainer_conf.to_dict()) - - def call_back(**kwargs): - pass - - # model training - score, output, model_state = trainer_fn( - model=self.model, - dataloaders=self.data_loaders, - seed=self.seed, - uid=uid, - cb=call_back, - ) - self.assertAlmostEqual(score, 74.24, places=1) - -if __name__ == "__main__": - unittest.main() diff --git a/bias_transfer/tests/test_dataset_filter.py b/bias_transfer/tests/test_dataset_filter.py deleted file mode 100644 index a28c5f3..0000000 --- a/bias_transfer/tests/test_dataset_filter.py +++ /dev/null @@ -1,71 +0,0 @@ -import unittest -from bias_transfer.configs import dataset -from bias_transfer.tests._base import BaseTest -import nnfabrik as nnf - - -class DatasetFilterTest(BaseTest): - def test_cifar100(self): - print("===================================================", flush=True) - print("=================TEST CIFAR100 Filter==============", flush=True) - start = 10 - end = 90 - dataset_conf = dataset.ImageDatasetConfig( - comment="Minimal CIFAR100", - dataset_cls="CIFAR100", - apply_data_normalization=True, - apply_data_augmentation=True, - add_corrupted_test=True, - valid_size=0.00, - filter_classes=(start, end), - seed=42, - ) - data_loaders = nnf.builder.get_data(dataset_conf.fn, dataset_conf.to_dict()) - self.assertEqual(len(data_loaders["train"]["img_classification"].dataset), (end-start) * 500) - self.assertEqual(len(data_loaders["test"]["img_classification"].dataset), (end-start) * 100) - self.assertEqual(len(data_loaders["c_test"]["frost"][1].dataset), (end-start) * 100) - - - def test_cifar10(self): - print("===================================================", flush=True) - print("=================TEST CIFAR10 Filter===============", flush=True) - start = 2 - end = 10 - dataset_conf = dataset.ImageDatasetConfig( - comment="Minimal CIFAR10", - dataset_cls="CIFAR10", - apply_data_normalization=True, - apply_data_augmentation=True, - add_corrupted_test=True, - valid_size=0.00, - filter_classes=(start, end), - seed=42, - ) - data_loaders = nnf.builder.get_data(dataset_conf.fn, dataset_conf.to_dict()) - self.assertEqual(len(data_loaders["train"]["img_classification"].dataset), (end-start) * 5000) - self.assertEqual(len(data_loaders["test"]["img_classification"].dataset), (end-start) * 1000) - self.assertEqual(len(data_loaders["c_test"]["speckle_noise"][5].dataset), (end-start) * 1000) - - - def test_tiny_imagenet(self): - print("===================================================", flush=True) - print("===============TEST TinyImageNet Filter============", flush=True) - start = 0 - end = 150 - dataset_conf = dataset.ImageDatasetConfig( - comment="Minimal TinyImageNet", - dataset_cls="TinyImageNet", - apply_data_normalization=True, - apply_data_augmentation=True, - add_corrupted_test=True, - valid_size=0.00, - filter_classes=(start, end), - seed=42, - ) - data_loaders = 
nnf.builder.get_data(dataset_conf.fn, dataset_conf.to_dict()) - self.assertEqual(len(data_loaders["train"]["img_classification"].dataset), (end-start) * 500) - self.assertEqual(len(data_loaders["test"]["img_classification"].dataset), (end-start) * 50) - self.assertEqual(len(data_loaders["c_test"]["snow"][5].dataset), (end-start) * 50) - -if __name__ == "__main__": - unittest.main() diff --git a/bias_transfer/tests/test_lottery_ticket_pruning.py b/bias_transfer/tests/test_lottery_ticket_pruning.py deleted file mode 100644 index 9a4e6d8..0000000 --- a/bias_transfer/tests/test_lottery_ticket_pruning.py +++ /dev/null @@ -1,115 +0,0 @@ -import unittest -import torch - -from bias_transfer.configs import trainer -from bias_transfer.tests._main_loop_module import MainLoopModuleTest -from bias_transfer.trainer.main_loop_modules import LotteryTicketPruning -from bias_transfer.models.utils import weight_reset - - -class LotteryTicketPruningTest(MainLoopModuleTest): - def pre_epoch_test(self, model, epoch): - mask_sum = sum([torch.sum(m).cpu().detach().item() for m in self.module.mask]) - p = (1 - (self.percent / 100)) ** (1 / self.rounds) - self.assertAlmostEqual( - mask_sum, self.total_parameters * p, places=-1 - ) # remaining parameters - - def post_backward_test(self, model): - step = 0 - for name, p in model.named_parameters(): - if "weight" in name: - grad_tensor = p.grad.data - grad_masked = (grad_tensor == 0).int() - self.assertTrue( - torch.all((grad_masked + self.module.mask[step]) > 0).cpu().item() - ) - step += 1 - - # def test_module(self): - # print("===================================================", flush=True) - # print("=====TEST the individual module components=========", flush=True) - # self.rounds = 1 - # self.percent = 80 - # trainer_conf = trainer.TrainerConfig( - # comment="Minimal Training Test", - # max_iter=3, - # verbose=False, - # noise_test={"noise_snr": [], "noise_std": [],}, - # restore_best=False, - # lr_milestones=(1, 2), - # adaptive_lr=False, - # patience=1000, - # lottery_ticket={ - # "rounds": self.rounds, - # "round_length": 1, - # "percent_to_prune": self.percent, - # "pruning": True, - # "reinit": False, - # "global_pruning": True, - # }, - # ) - # self.model.apply(weight_reset) - # device = "cuda" if torch.cuda.is_available() else "cpu" - # self.model.to(device) - # self.module = LotteryTicketPruning( - # self.model, trainer_conf, device, self.data_loaders["train"], self.seed - # ) - # mask_sum = sum([torch.sum(m) for m in self.module.mask]) # should all be one - # self.total_parameters = 0 - # for name, param in self.model.named_parameters(): - # if "weight" in name: - # size = 1 - # for l in list(param.size()): - # size *= l - # self.total_parameters += size - # self.assertEqual(mask_sum, self.total_parameters) - # self.main_loop( - # self.model, - # self.data_loaders["train"], - # self.module, - # trainer_conf, - # device=device, - # epoch=2, - # ) - - def test_training(self): - print("===================================================", flush=True) - print("===========TEST complete training==================", flush=True) - rounds = 2 - round_length = 3 - percent = 69 - trainer_conf = trainer.TrainerConfig( - comment="Minimal Training Test", - max_iter=3, - verbose=False, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - lr_milestones=None, - adaptive_lr=False, - patience=1000, - lottery_ticket={ - "rounds": rounds, - "round_length": round_length, - "percent_to_prune": percent, - "pruning": True, - "reinit": False, - 
"global_pruning": True, - }, - ) - score = self.run_training(trainer_conf) - zero_parameters, total_parameters = 0, 0 - for name, param in self.model.named_parameters(): - if "weight" in name and "fc" not in name: - zero_parameters += torch.sum((param.data == 0).int()).item() - size = 1 - for l in list(param.size()): - size *= l - total_parameters += size - self.assertAlmostEqual( - zero_parameters, total_parameters * (percent / 100), places=-2 - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/bias_transfer/tests/test_model.py b/bias_transfer/tests/test_model.py deleted file mode 100644 index e46db1e..0000000 --- a/bias_transfer/tests/test_model.py +++ /dev/null @@ -1,187 +0,0 @@ -import unittest -from bias_transfer.configs import trainer, model, dataset -from bias_transfer.tests._base import BaseTest - - -class ModelTest(BaseTest): - def test_noise_adv_training(self): - print("===================================================", flush=True) - print("============TEST noise-adversarial training========", flush=True) - model_conf = model.ClassificationModelConfig( - comment="CIFAR10", - dataset_cls="CIFAR10", - type="resnet18", - noise_adv_regression=True, - ) - self.get_parts(self.dataset_conf, model_conf, self.seed) - trainer_conf = trainer.TrainerConfig( - comment="Noise Adversarial Training Test", - max_iter=2, - verbose=False, - add_noise=True, - noise_snr=None, - noise_std={0.08: 0.1, 0.12: 0.1, 0.18: 0.1, 0.26: 0.1, 0.38: 0.1, -1: 0.5,}, - noise_adv_regression=True, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - lr_milestones=(1,), - adaptive_lr=False, - early_stop=False, - patience=1000, - ) - score = self.run_training(trainer_conf) - self.assertAlmostEqual(score, 18.04, places=1) - self.setUpClass() - - def test_noise_adv_training_vgg(self): - print("===================================================", flush=True) - print("========TEST noise-adversarial training (VGG)======", flush=True) - model_conf = model.ClassificationModelConfig( - comment="CIFAR10", - dataset_cls="CIFAR10", - type="vgg19_bn", - noise_adv_regression=True, - ) - self.get_parts(self.dataset_conf, model_conf, self.seed) - trainer_conf = trainer.TrainerConfig( - comment="Noise Adversarial Training Test", - max_iter=2, - verbose=False, - add_noise=True, - noise_snr=None, - noise_std={0.08: 0.1, 0.12: 0.1, 0.18: 0.1, 0.26: 0.1, 0.38: 0.1, -1: 0.5,}, - noise_adv_regression=True, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - lr_milestones=(1,), - adaptive_lr=False, - early_stop=False, - patience=1000, - ) - score = self.run_training(trainer_conf) - self.assertAlmostEqual(score, 16.92, places=1) - self.setUpClass() - - def test_representation_matching(self): - print("===================================================", flush=True) - print("=======TEST representation matching training ======", flush=True) - model_conf = model.ClassificationModelConfig( - comment="CIFAR10", - dataset_cls="CIFAR10", - type="resnet18", - representation_matching=True, - ) - self.get_parts(self.dataset_conf, model_conf, self.seed) - trainer_conf = trainer.TrainerConfig( - comment="Representation Matching Training Test", - max_iter=2, - verbose=False, - add_noise=True, - noise_snr=None, - noise_std={0.08: 0.1, 0.12: 0.1, 0.18: 0.1, 0.26: 0.1, 0.38: 0.1, -1: 0.5,}, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - lr_milestones=(1,), - adaptive_lr=False, - early_stop=False, - patience=1000, - representation_matching={ - "representation": "core", - 
"criterion": "cosine", - "second_noise_std": {(0, 1.0): 1.0}, - "lambda": 1.0, - }, - ) - score = self.run_training(trainer_conf) - self.assertAlmostEqual(score, 37.4, places=1) - self.setUpClass() - - def test_representation_matching_vgg(self): - print("===================================================", flush=True) - print("====TEST representation matching training (VGG)====", flush=True) - model_conf = model.ClassificationModelConfig( - comment="CIFAR10", - dataset_cls="CIFAR10", - type="vgg19_bn", - representation_matching=True, - ) - self.get_parts(self.dataset_conf, model_conf, self.seed) - trainer_conf = trainer.TrainerConfig( - comment="Representation Matching Training Test", - max_iter=2, - verbose=False, - add_noise=True, - noise_snr=None, - noise_std={0.08: 0.1, 0.12: 0.1, 0.18: 0.1, 0.26: 0.1, 0.38: 0.1, -1: 0.5,}, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - early_stop=False, - lr_milestones=(1,), - adaptive_lr=False, - patience=1000, - representation_matching={ - "representation": "core", - "criterion": "cosine", - "second_noise_std": {(0, 1.0): 1.0}, - "lambda": 1.0, - }, - ) - score = self.run_training(trainer_conf) - self.assertAlmostEqual(score, 17.44, places=1) - self.setUpClass() - - def test_resnet_50(self): - print("===================================================", flush=True) - print("================TEST ResNet50 Training=============", flush=True) - model_conf = model.ClassificationModelConfig( - comment="CIFAR10", dataset_cls="CIFAR10", type="resnet50", - ) - self.get_parts(self.dataset_conf, model_conf, self.seed) - trainer_conf = trainer.TrainerConfig( - comment="ResNet50 Training Test", - max_iter=2, - verbose=False, - add_noise=False, - noise_snr=None, - noise_std=None, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - early_stop=False, - lr_milestones=(1,), - adaptive_lr=False, - patience=1000, - ) - score = self.run_training(trainer_conf) - self.assertAlmostEqual(score, 12.88, places=1) - # reset: - self.setUpClass() - - def test_vgg_19(self): - print("===================================================", flush=True) - print("==================TEST VGG19 Training==============", flush=True) - model_conf = model.ClassificationModelConfig( - comment="CIFAR10", dataset_cls="CIFAR10", type="vgg19_bn", - ) - self.get_parts(self.dataset_conf, model_conf, self.seed) - trainer_conf = trainer.TrainerConfig( - comment="VGG19 Training Test", - max_iter=2, - verbose=False, - add_noise=False, - noise_snr=None, - noise_std=None, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - early_stop=False, - lr_milestones=(1,), - adaptive_lr=False, - patience=1000, - ) - score = self.run_training(trainer_conf) - self.assertAlmostEqual(score, 17.64, places=1) - # reset: - self.setUpClass() - - -if __name__ == "__main__": - unittest.main() diff --git a/bias_transfer/tests/test_training.py b/bias_transfer/tests/test_training.py deleted file mode 100644 index 619e307..0000000 --- a/bias_transfer/tests/test_training.py +++ /dev/null @@ -1,125 +0,0 @@ -import unittest -import torch -import os -import copy -import numpy as np -import nnfabrik as nnf -from bias_transfer.configs import trainer -from bias_transfer.models.utils import weight_reset -from bias_transfer.tests._base import BaseTest - - -class TrainingTest(BaseTest): - def test_training_adaptive_lr_schedule(self): - print("===================================================", flush=True) - print("=========TEST adaptive_lr training=================", 
flush=True) - trainer_conf = trainer.TrainerConfig( - comment="Adaptive LR Training Test", - max_iter=3, - verbose=False, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - lr_milestones=None, - adaptive_lr=True, - early_stop=True, - patience=2, - ) - score = self.run_training(trainer_conf) - self.assertAlmostEqual(score, 65.08, places=1) - - def test_training_fixed_lr_schedule(self): - print("===================================================", flush=True) - print("===========TEST fixed_lr training==================", flush=True) - trainer_conf = trainer.TrainerConfig( - comment="Fixed LR Training Test", - max_iter=3, - verbose=False, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - lr_milestones=(1, 2), - adaptive_lr=False, - early_stop=False, - patience=1000, - ) - score = self.run_training(trainer_conf) - self.assertAlmostEqual(score, 79.72, places=1) - - def test_training_noise_augment_std(self): - print("===================================================", flush=True) - print("==========TEST noise-augmented training STD =======", flush=True) - trainer_conf = trainer.TrainerConfig( - comment="Noise Augmented Training Test", - max_iter=2, - verbose=False, - add_noise=True, - noise_snr=None, - noise_std={0.08: 0.1, 0.12: 0.1, 0.18: 0.1, 0.26: 0.1, 0.38: 0.1, -1: 0.5,}, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - lr_milestones=(1,), - adaptive_lr=False, - early_stop=False, - patience=1000, - ) - score = self.run_training(trainer_conf) - self.assertAlmostEqual(score, 46.76, places=1) - - def test_training_noise_augment_snr(self): - print("===================================================", flush=True) - print("===========TEST noise-augmented training SNR=======", flush=True) - trainer_conf = trainer.TrainerConfig( - comment="Noise Augmented Training Test", - max_iter=2, - verbose=False, - add_noise=True, - noise_snr={1.0: 0.25, 1.5: 0.25, -1: 0.5}, - noise_std=None, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - lr_milestones=(1,), - adaptive_lr=False, - early_stop=False, - patience=1000, - ) - score = self.run_training(trainer_conf) - self.assertAlmostEqual(score, 40.16, places=1) - - def test_freeze_params(self): - print("===================================================", flush=True) - print("=============TEST freeze params====================", flush=True) - trainer_conf = trainer.TrainerConfig( - comment="Fixed LR Training Test", - max_iter=2, - verbose=False, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - lr_milestones=(1,), - adaptive_lr=False, - early_stop=False, - freeze=("readout",), - patience=1000, - ) - torch.manual_seed(self.seed) - np.random.seed(self.seed) - torch.cuda.manual_seed(self.seed) - self.model.apply(weight_reset) - readout_weight_before = torch.clone( - dict(self.model.named_parameters())["fc.weight"].data - ).cpu() - readout_bias_before = torch.clone( - dict(self.model.named_parameters())["fc.bias"].data - ).cpu() - _ = self.run_training(trainer_conf) - readout_weight_after = dict(self.model.named_parameters())[ - "fc.weight" - ].data.cpu() - readout_bias_after = dict(self.model.named_parameters())["fc.bias"].data.cpu() - self.assertTrue( - torch.all(torch.eq(readout_weight_before, readout_weight_after)) - ) - self.assertTrue(torch.all(torch.eq(readout_bias_before, readout_bias_after))) - self.setUpClass() - - -if __name__ == "__main__": - unittest.main() diff --git a/bias_transfer/tests/test_transfer.py 
b/bias_transfer/tests/test_transfer.py deleted file mode 100644 index 356be5d..0000000 --- a/bias_transfer/tests/test_transfer.py +++ /dev/null @@ -1,350 +0,0 @@ -import unittest -import os -import torch -import numpy as np -import nnfabrik as nnf -from bias_transfer.configs import trainer, model -from bias_transfer.models.utils import weight_reset -from bias_transfer.tests._base import BaseTest - - -class TransferTest(BaseTest): - def test_transfer_training(self): - self.setUpClass() - print("===================================================", flush=True) - print("============TEST transfer training=================", flush=True) - pretrained_path = "./checkpoint/ckpt.{}.pth".format( - nnf.utility.dj_helpers.make_hash("test1") - ) - transfer_path = "./checkpoint/ckpt.to_transfer.pth" - if os.path.exists(transfer_path): - os.remove(transfer_path) - pretrain_trainer_conf = trainer.TrainerConfig( - comment="Noise Augmented Training Test", - max_iter=2, - verbose=False, - add_noise=True, - noise_snr=None, - noise_std={0.08: 0.1, 0.12: 0.1, 0.18: 0.1, 0.26: 0.1, 0.38: 0.1, -1: 0.5,}, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - lr_milestones=(1,), - adaptive_lr=False, - early_stop=False, - patience=1000, - ) - score = self.run_training(pretrain_trainer_conf) - self.assertAlmostEqual(score, 46.76, places=1) - state_dict = torch.load(pretrained_path) - torch.save(state_dict["net"],transfer_path) - transfer_trainer_conf = trainer.TrainerConfig( - comment="Transfer Training Test", - max_iter=2, - verbose=False, - add_noise=False, - noise_snr=None, - noise_std=None, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - lr_milestones=(1,), - adaptive_lr=False, - early_stop=False, - patience=1000, - transfer_from_path=transfer_path, - freeze=("core",), - reset_linear=True, - ) - score = self.run_training(transfer_trainer_conf) - self.assertAlmostEqual(score, 38.64, places=1) - - def test_transfer_rep_match_training(self): - self.setUpClass() - print("===================================================", flush=True) - print("========TEST transfer rep match training===========", flush=True) - pretrained_path = "./checkpoint/ckpt.{}.pth".format( - nnf.utility.dj_helpers.make_hash("test1") - ) - transfer_path = "./checkpoint/ckpt.to_transfer.pth" - if os.path.exists(transfer_path): - os.remove(transfer_path) - model_conf = model.ClassificationModelConfig( - comment="CIFAR10", - dataset_cls="CIFAR10", - type="resnet18", - representation_matching=True, - ) - self.get_parts(self.dataset_conf, model_conf, self.seed) - pretrain_trainer_conf = trainer.TrainerConfig( - comment="Noise Augmented Training Test", - max_iter=2, - verbose=False, - add_noise=True, - noise_snr=None, - noise_std={0.08: 0.1, 0.12: 0.1, 0.18: 0.1, 0.26: 0.1, 0.38: 0.1, -1: 0.5,}, - noise_test={"noise_snr": [], "noise_std": [],}, - representation_matching={ - "representation": "core", - "criterion": "cosine", - "second_noise_std": {(0, 1.0): 1.0}, - "lambda": 1.0, - }, - restore_best=False, - lr_milestones=(1,), - adaptive_lr=False, - early_stop=False, - patience=1000, - ) - score = self.run_training(pretrain_trainer_conf) - self.assertAlmostEqual(score, 37.4, places=1) - state_dict = torch.load(pretrained_path) - torch.save(state_dict["net"],transfer_path) - self.get_parts(self.dataset_conf, self.model_conf, self.seed) - transfer_trainer_conf = trainer.TrainerConfig( - comment="Transfer Training Test", - max_iter=2, - verbose=False, - add_noise=False, - noise_snr=None, - noise_std=None, - 
noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - lr_milestones=(1,), - adaptive_lr=False, - early_stop=False, - patience=1000, - transfer_from_path=transfer_path, - freeze=("core",), - reset_linear=True, - ) - score = self.run_training(transfer_trainer_conf) - self.assertAlmostEqual(score, 35.48, places=1) - self.setUpClass() - - def test_transfer_to_noise_aug_training(self): - self.setUpClass() - print("===================================================", flush=True) - print("=======TEST transfer to noise aug training=========", flush=True) - pretrained_path = "./checkpoint/ckpt.{}.pth".format( - nnf.utility.dj_helpers.make_hash("test1") - ) - transfer_path = "./checkpoint/ckpt.to_transfer.pth" - if os.path.exists(transfer_path): - os.remove(transfer_path) - pretrain_trainer_conf = trainer.TrainerConfig( - comment="Clean Training Test", - max_iter=3, - verbose=False, - add_noise=False, - noise_snr=None, - noise_std=None, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - lr_milestones=(1,), - adaptive_lr=False, - early_stop=False, - patience=1000, - ) - score = self.run_training(pretrain_trainer_conf) - self.assertAlmostEqual(score, 66.36, places=1) - state_dict = torch.load(pretrained_path) - torch.save(state_dict["net"],transfer_path) - transfer_trainer_conf = trainer.TrainerConfig( - comment="Transfer Training Test", - max_iter=2, - verbose=False, - add_noise=True, - noise_snr=None, - noise_std={0.08: 0.1, 0.12: 0.1, 0.18: 0.1, 0.26: 0.1, 0.38: 0.1, -1: 0.5,}, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - lr_milestones=(1,), - adaptive_lr=False, - early_stop=False, - patience=1000, - transfer_from_path=transfer_path, - freeze=("core",), - reset_linear=True, - ) - score = self.run_training(transfer_trainer_conf) - self.assertAlmostEqual(score, 69.36, places=1) - - def test_rdm_transfer_training(self): - print("===================================================", flush=True) - print("===========TEST RDM transfer training==============", flush=True) - pretrained_path = "./checkpoint/ckpt.{}.pth".format( - nnf.utility.dj_helpers.make_hash("test1") - ) - transfer_path = "./checkpoint/ckpt.to_transfer.pth" - if os.path.exists(transfer_path): - os.remove(transfer_path) - pretrain_trainer_conf = trainer.TrainerConfig( - comment="Noise Augmented Training Test", - max_iter=2, - verbose=False, - add_noise=True, - noise_snr=None, - noise_std={0.08: 0.1, 0.12: 0.1, 0.18: 0.1, 0.26: 0.1, 0.38: 0.1, -1: 0.5,}, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - early_stop=False, - lr_milestones=(1,), - adaptive_lr=False, - patience=1000, - ) - score = self.run_training(pretrain_trainer_conf) - self.assertAlmostEqual(score, 46.76, places=1) - state_dict = torch.load(pretrained_path) - torch.save(state_dict["net"],transfer_path) - model_conf = model.ClassificationModelConfig( - comment="CIFAR10", - dataset_cls="CIFAR10", - type="resnet18", - rdm_prediction=True - ) - self.get_parts(self.dataset_conf, model_conf, self.seed) - transfer_trainer_conf = trainer.TrainerConfig( - comment="RDM Transfer Training Test", - max_iter=2, - verbose=False, - add_noise=False, - noise_snr=None, - noise_std=None, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - lr_milestones=(1,), - adaptive_lr=False, - early_stop=False, - patience=1000, - transfer_from_path=transfer_path, - # freeze=("core",), - # reset_linear=True, - rdm_transfer=True, - rdm_prediction={"lambda": 1.0}, - ) - score = 
self.run_training(transfer_trainer_conf) - self.assertAlmostEqual(score, 27.24, places=1) - # reset model - self.setUpClass() - - def test_transfer_training_vgg(self): - print("===================================================", flush=True) - print("=========TEST transfer training (VGG)==============", flush=True) - model_conf = model.ClassificationModelConfig( - comment="CIFAR10", - dataset_cls="CIFAR10", - type="vgg19_bn", - ) - self.get_parts(self.dataset_conf, model_conf, self.seed) - pretrained_path = "./checkpoint/ckpt.{}.pth".format( - nnf.utility.dj_helpers.make_hash("test1") - ) - transfer_path = "./checkpoint/ckpt.to_transfer.pth" - if os.path.exists(transfer_path): - os.remove(transfer_path) - pretrain_trainer_conf = trainer.TrainerConfig( - comment="Noise Augmented Training Test", - max_iter=2, - verbose=False, - add_noise=True, - noise_snr=None, - noise_std={0.08: 0.1, 0.12: 0.1, 0.18: 0.1, 0.26: 0.1, 0.38: 0.1, -1: 0.5,}, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - early_stop=False, - lr_milestones=(1,), - adaptive_lr=False, - patience=1000, - readout_name="classifier" - ) - score = self.run_training(pretrain_trainer_conf) - self.assertAlmostEqual(score, 17.92, places=1) - state_dict = torch.load(pretrained_path) - torch.save(state_dict["net"],transfer_path) - transfer_trainer_conf = trainer.TrainerConfig( - comment="Transfer Training Test", - max_iter=2, - verbose=False, - add_noise=False, - noise_snr=None, - noise_std=None, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - lr_milestones=(1,), - adaptive_lr=False, - early_stop=False, - patience=1000, - transfer_from_path=transfer_path, - freeze=("core",), - reset_linear=True, - readout_name="classifier" - ) - score = self.run_training(transfer_trainer_conf) - self.assertAlmostEqual(score, 29.2, places=1) - self.setUpClass() - - def test_rdm_transfer_training_vgg(self): - print("===================================================", flush=True) - print("=======TEST RDM transfer training (VGG)============", flush=True) - model_conf = model.ClassificationModelConfig( - comment="CIFAR10", - dataset_cls="CIFAR10", - type="vgg19_bn", - ) - self.get_parts(self.dataset_conf, model_conf, self.seed) - pretrained_path = "./checkpoint/ckpt.{}.pth".format( - nnf.utility.dj_helpers.make_hash("test1") - ) - transfer_path = "./checkpoint/ckpt.to_transfer.pth" - if os.path.exists(transfer_path): - os.remove(transfer_path) - pretrain_trainer_conf = trainer.TrainerConfig( - comment="Noise Augmented Training Test", - max_iter=2, - verbose=False, - add_noise=True, - noise_snr=None, - noise_std={0.08: 0.1, 0.12: 0.1, 0.18: 0.1, 0.26: 0.1, 0.38: 0.1, -1: 0.5,}, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - lr_milestones=(1,), - adaptive_lr=False, - early_stop=False, - patience=1000, - readout_name="classifier" - ) - score = self.run_training(pretrain_trainer_conf) - self.assertAlmostEqual(score, 17.92, places=1) - state_dict = torch.load(pretrained_path) - torch.save(state_dict["net"],transfer_path) - model_conf = model.ClassificationModelConfig( - comment="CIFAR10", - dataset_cls="CIFAR10", - type="vgg19_bn", - rdm_prediction=True - ) - self.get_parts(self.dataset_conf, model_conf, self.seed) - transfer_trainer_conf = trainer.TrainerConfig( - comment="RDM Transfer Training Test", - max_iter=2, - verbose=False, - add_noise=False, - noise_snr=None, - noise_std=None, - noise_test={"noise_snr": [], "noise_std": [],}, - restore_best=False, - lr_milestones=(1,), - 
adaptive_lr=False, - early_stop=False, - patience=1000, - transfer_from_path=transfer_path, - rdm_transfer=True, - rdm_prediction={"lambda": 1.0}, - readout_name="classifier" - ) - score = self.run_training(transfer_trainer_conf) - self.assertAlmostEqual(score, 20.52, places=1) - # reset model - self.setUpClass() - -if __name__ == "__main__": - unittest.main() diff --git a/bias_transfer/trainer/__init__.py b/bias_transfer/trainer/__init__.py deleted file mode 100644 index 62206d4..0000000 --- a/bias_transfer/trainer/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .img_classification_trainer import trainer as img_classification -from .neural_trainer import trainer as neural -from .regression_trainer import trainer as regression -from bias_transfer.trainer.transfer import trainer as transfer -from bias_transfer.trainer.transfer import regression_trainer as regression_transfer diff --git a/bias_transfer/trainer/img_classification_trainer.py b/bias_transfer/trainer/img_classification_trainer.py deleted file mode 100644 index 3f41084..0000000 --- a/bias_transfer/trainer/img_classification_trainer.py +++ /dev/null @@ -1,219 +0,0 @@ -from functools import partial - -from bias_transfer.trainer.utils.checkpointing import ( - RemoteCheckpointing, - LocalCheckpointing, -) -from bias_transfer.trainer.trainer import Trainer -from bias_transfer.trainer.utils import get_subdict, stringify -from bias_transfer.trainer.utils.loss import * -from neuralpredictors.tracking import AdvancedMultipleObjectiveTracker - -from torch import nn, optim - - -def trainer(model, dataloaders, seed, uid, cb, eval_only=False, **kwargs): - t = ImgClassificationTrainer(dataloaders, model, seed, uid, cb, **kwargs) - return t.train() - - -class ImgClassificationTrainer(Trainer): - checkpointing_cls = LocalCheckpointing - - @property - def tracker(self): - try: - return self._tracker - except AttributeError: - objectives = { - "LR": 0, - "Training": { - "img_classification": {"loss": 0, "accuracy": 0, "normalization": 0} - }, - "Validation": { - "img_classification": { - "loss": 0, - "accuracy": 0, - "normalization": 0, - }, - "patience": 0, - }, - } - self._tracker = AdvancedMultipleObjectiveTracker( - main_objective=("img_classification", "accuracy"), **objectives - ) - return self._tracker - - def get_training_controls(self): - criterion, stop_closure = {}, {} - for k in self.task_keys: - if k == "transfer" or k not in self.config.loss_functions: - continue # no validation on this data and training is handled in mainloop modules - criterion[k] = ( - globals().get(self.config.loss_functions[k]) - or getattr(nn, self.config.loss_functions[k]) - )() - - stop_closure[k] = partial( - self.main_loop, - data_loader=get_subdict(self.data_loaders["validation"], [k]), - mode="Validation", - epoch=0, - cycler_args={}, - cycler="LongCycler", - ) - optimizer = getattr(optim, self.config.optimizer)( - self.model.parameters(), **self.config.optimizer_options - ) - return optimizer, stop_closure, criterion - - def move_data(self, batch_data): - data_key, inputs, targets = batch_data[0], batch_data[1][0], batch_data[1][1] - - # targets - if isinstance(targets, dict): - targets = {k: t.to(self.device) for k, t in targets.items()} - if len(targets) == 1 and data_key != "transfer": - targets = next(iter(targets.values())) - else: - targets = targets.to(self.device) - - # inputs - if ( - isinstance(inputs, dict) and len(inputs) == 1 - ): # TODO add support for multiple inputs - inputs = next(iter(inputs.values())) - inputs = inputs.to(self.device, 
dtype=torch.float) - - return inputs, targets, data_key, None - - def compute_loss( - self, - mode, - task_key, - loss, - outputs, - targets, - ): - if task_key != "transfer" and task_key in self.config.loss_functions: - if not ( - self.config.regularization - and self.config.regularization.get("regularizer") == "Mixup" - ): # otherwise this is done in the mainloop-module - loss += self.criterion[task_key](outputs, targets) - _, predicted = outputs.max(1) - self.tracker.log_objective( - 100 * predicted.eq(targets).sum().item(), - keys=(mode, task_key, "accuracy"), - ) - batch_size = targets.size(0) - self.tracker.log_objective( - batch_size, - keys=(mode, task_key, "normalization"), - ) - self.tracker.log_objective( - loss.item() * batch_size, - keys=(mode, task_key, "loss"), - ) - return loss - - def test_final_model(self, epoch, bn_train=""): - deactivate_options = { - "noise_snr": None, - "noise_std": None, - "rep_matching": False, - "rep_monitoring": False, - "noise_adv": False, - } - if not bn_train and self.config.eval_with_bn_train: - self.test_final_model(epoch, bn_train=" BN=Train") - # test the final model with noise on the dev-set - # test the final model on the test set - for k in self.task_keys: - if k == "transfer": - continue - if "rep_matching" not in k and self.config.noise_test: - for n_type, n_vals in self.config.noise_test.items(): - for val in n_vals: - val_str = stringify(val) - mode = "Noise {} {}".format(n_type, val_str) + bn_train - objectives = { - mode: { - k: { - "accuracy": 0, - "loss": 0, - "normalization": 0, - } - } - } - self.tracker.add_objectives(objectives, init_epoch=True) - - module_options = deactivate_options.copy() - module_options[n_type] = val - self.main_loop( - epoch=epoch, - data_loader=get_subdict( - self.data_loaders["validation"], [k] - ), - mode=mode, - cycler_args={}, - cycler="LongCycler", - module_options=module_options, - ) - - objectives = { - "Test" - + bn_train: { - k: { - "accuracy": 0, - "loss": 0, - "normalization": 0, - } - } - } - self.tracker.add_objectives(objectives, init_epoch=True) - test_result = self.main_loop( - epoch=epoch, - data_loader=get_subdict(self.data_loaders["test"], [k]), - mode="Test" + bn_train, - cycler_args={}, - cycler="LongCycler", - module_options=deactivate_options, - ) - if "c_test" in self.data_loaders: - for k in self.task_keys: - if "rep_matching" not in k: - for c_category in list(self.data_loaders["c_test"][k].keys()): - for c_level, data_loader in self.data_loaders["c_test"][k][ - c_category - ].items(): - - objectives = { - c_category - + bn_train: { - str(c_level): { - "accuracy": 0, - "loss": 0, - "normalization": 0, - } - } - } - self.tracker.add_objectives(objectives, init_epoch=True) - self.main_loop( - epoch=epoch, - data_loader={str(c_level): data_loader}, - mode=c_category + bn_train, - cycler_args={}, - cycler="LongCycler", - module_options=deactivate_options, - ) - if "st_test" in self.data_loaders: - self.main_loop( - epoch=epoch, - data_loader={"img_classification": self.data_loaders["st_test"]}, - mode="Test-ST" + bn_train, - cycler_args={}, - cycler="LongCycler", - module_options=deactivate_options, - ) - return test_result diff --git a/bias_transfer/trainer/main_loop_modules/__init__.py b/bias_transfer/trainer/main_loop_modules/__init__.py deleted file mode 100644 index e9e43d4..0000000 --- a/bias_transfer/trainer/main_loop_modules/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from .noise_adv_training import NoiseAdvTraining -from .noise_augmentation import NoiseAugmentation -from 
.random_readout_reset import RandomReadoutReset -from .representation_matching import RepresentationMatching -from .representation_monitor import RepresentationMonitor -from .lottery_ticket_pruning import LotteryTicketPruning -from .model_wrapper import ModelWrapper -from .synaptic_intelligence import SynapticIntelligence -from .fisher_estimation import FisherEstimation -from .parameter_regularization.param_distance import ParamDistance -from .parameter_regularization.mixup import Mixup -from .parameter_regularization.vcl import VCL -from .representation_regularization.rdl import RDL -from .representation_regularization.knowledge_distillation import KnowledgeDistillation -from .function_regularization.frcl import FRCL -from .function_regularization.fromp import FROMP diff --git a/bias_transfer/trainer/main_loop_modules/fisher_estimation.py b/bias_transfer/trainer/main_loop_modules/fisher_estimation.py deleted file mode 100644 index a17137d..0000000 --- a/bias_transfer/trainer/main_loop_modules/fisher_estimation.py +++ /dev/null @@ -1,62 +0,0 @@ -from .main_loop_module import MainLoopModule -import torch.nn.functional as F - - -class FisherEstimation(MainLoopModule): - """ - Implementation adapted from https://github.com/GMvandeVen/continual-learning/blob/master/continual_learner.py - """ - - def __init__(self, trainer): - super().__init__(trainer) - self.num_samples = self.config.compute_fisher.get("num_samples", 128) - self.empirical = self.config.compute_fisher.get("empirical", False) - self.est_fisher_info = {} - - def pre_epoch(self, model, mode, **options): - super().pre_epoch(model, mode, **options) - # Prepare to store estimated Fisher Information matrix - for n, p in model.named_parameters(): - if p.requires_grad: - n = n.replace(".", "__") - self.est_fisher_info[n] = p.detach().clone().zero_() - - def post_forward(self, outputs, loss, targets, **shared_memory): - model = self.trainer.model - if self.empirical: - # use provided label to calculate loglikelihood --> "empirical Fisher": - label = targets - else: - # use predicted label to calculate loglikelihood: - label = outputs.max(1)[1] - # calculate negative log-likelihood - loss = self.trainer.criterion[self.task_key](outputs, label) - # loss = F.nll_loss(F.log_softmax(outputs, dim=1), label) - - # Calculate gradient of negative loglikelihood - model.zero_grad() - loss.backward() - - # Square gradients and keep running sum - for n, p in model.named_parameters(): - if p.requires_grad: - n = n.replace(".", "__") - if p.grad is not None: - self.est_fisher_info[n] += p.grad.detach() ** 2 - - return outputs, loss, targets - - def post_epoch(self, model): - # Normalize by sample size used for estimation - est_fisher_info = { - n: p / self.num_samples for n, p in self.est_fisher_info.items() - } - - # Store new values in the network - for n, p in model.named_parameters(): - if p.requires_grad: - n = n.replace(".", "__") - # precision (approximated by diagonal Fisher Information matrix) - model.register_buffer( - f"{n}_importance", est_fisher_info[n], - ) diff --git a/bias_transfer/trainer/main_loop_modules/function_regularization/__init__.py b/bias_transfer/trainer/main_loop_modules/function_regularization/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bias_transfer/trainer/main_loop_modules/function_regularization/frcl.py b/bias_transfer/trainer/main_loop_modules/function_regularization/frcl.py deleted file mode 100644 index b9b3648..0000000 --- 
a/bias_transfer/trainer/main_loop_modules/function_regularization/frcl.py +++ /dev/null @@ -1,80 +0,0 @@ -from functools import partial - -import torch - -from bias_transfer.trainer.main_loop_modules.main_loop_module import MainLoopModule - - -class FRCL(MainLoopModule): - def __init__(self, trainer): - super().__init__(trainer) - self.eps = self.config.regularization.get("eps", 1e-8) - self.num_samples = self.config.regularization.get("num_samples", 9) - self.train_len = len( - self.trainer.data_loaders["train"]["img_classification"].dataset - ) - - def pre_forward(self, model, inputs, task_key, shared_memory): - super().pre_forward(model, inputs, task_key, shared_memory) - model_ = partial(model, num_samples=self.num_samples) - return model_, inputs - - def post_forward(self, outputs, loss, targets, **shared_memory): - if self.train_mode: - loss += self._calculate_kl_term() / self.train_len - targets = targets.repeat(self.num_samples).view(-1) - return outputs, loss, targets - - @staticmethod - def kl(m1, S1, m2, S2): - S2 = S2 + torch.eye(S2.shape[0]).to(S2) * 1e-3 - S1 = S1 + torch.eye(S1.shape[0]).to(S1) * 1e-3 - S2_ = torch.inverse(S2) - return 0.5 * ( - torch.trace(S2_ @ S1) - + (m2 - m1).T @ S2_ @ (m2 - m1) - - S1.shape[0] - + torch.logdet(S2) - - torch.logdet(S1) - ) - - def _calculate_kl_term(self): - model = self.trainer.model - kls = 0 - for i in range(model.num_classes): - # kls -= kl_divergence(self.w_distr[i], self.w_prior) - kls -= self.kl( - model.mu[i], - model.L[i] @ model.L[i].T, - model.w_prior.mean, - model.w_prior.covariance_matrix, - ) - # curr_task_kls = -kls.item() - - if model.prev: - out_dim = model.num_classes # model.prev_num_classes - phi_i = model.core_forward(model.coreset_prev) - cov_i = phi_i @ phi_i.T + torch.eye(phi_i.shape[0]).to(self.device) * 1e-6 - # p_u = MultivariateNormal(torch.zeros(cov_i.shape[0]).to(self.device), - # covariance_matrix=cov_i * self.sigma_prior) - # kls -= sum([kl_divergence(self.prev_tasks_distr[i][j], p_u) for j in range(self.out_dim)]) - prev_kls = sum( - [ - self.kl( - model._buffers[f"mu_prev_{j}"], - model._buffers[f"cov_prev_{j}"], - torch.zeros(cov_i.shape[0]).to(self.device), - cov_i * model.sigma_prior, - ) - for j in range(out_dim) - ] - ) - # if state is not None: - # state.kls.append(prev_kls.item()) - kls -= prev_kls - - # if state is not None: - # state.kls.append(curr_task_kls) - # state.kls_div_nk = kls.item() / N_k - # Sum KL over all parameters - return -kls diff --git a/bias_transfer/trainer/main_loop_modules/function_regularization/fromp.py b/bias_transfer/trainer/main_loop_modules/function_regularization/fromp.py deleted file mode 100644 index 4e88885..0000000 --- a/bias_transfer/trainer/main_loop_modules/function_regularization/fromp.py +++ /dev/null @@ -1,274 +0,0 @@ -from torch import nn -import torch -import torch.nn.functional as F - -from bias_transfer.trainer.main_loop_modules.function_regularization.fromp_utils import ( - update_input, - logistic_hessian, - full_softmax_hessian, - parameters_to_matrix, - parameter_grads_to_vector, - vector_to_parameter_grads, -) -from bias_transfer.trainer.main_loop_modules.main_loop_module import MainLoopModule -from neuralpredictors.training import eval_state - - -class FROMP(MainLoopModule): - """ - Adapted from https://github.com/team-approx-bayes/fromp - """ - - def __init__(self, trainer): - super().__init__(trainer) - self.prior_prec = self.config.regularization.get("prior_prec") - self.grad_clip_norm = self.config.regularization.get("grad_clip_norm") - 
self.alpha = self.config.regularization.get("alpha") - if self.prior_prec < 0.0: - raise ValueError(f"invalid prior precision: {self.prior_prec}") - if (self.grad_clip_norm is not None) and (not self.grad_clip_norm >= 0.0): - raise ValueError(f"invalid gradient clip norm: {self.grad_clip_norm}") - if self.alpha < 0.0: - raise ValueError(f"invalid alpha: {self.alpha}") - self.covariance = torch.tensor( - self.trainer.data_loaders.pop("covariance"), device=self.device - ) - main_task = next(iter(self.trainer.data_loaders["train"].keys())) - self.memorable_points_prev = ( - self.trainer.data_loaders["train"].pop(f"{main_task}_cs").dataset.samples - ) - self.model = self.trainer.model - self.optimizer = self.trainer.optimizer - self.train_modules = [] - self.set_train_modules(self.model, self.train_modules) - - self.init_task(self.config.regularization.get("eps", 1e-5)) - - def init_task(self, eps): - """ - Calculate values (memorable_logits, hkh_l) for regularisation term (all but the first task) - - """ - self.kernel_inv_prev_mem_prev_model = [] - covariance = 1.0 / (self.covariance + self.prior_prec) - - with eval_state(self.model): - memorable_data_prev = self.memorable_points_prev.to(self.device) - self.optimizer.zero_grad() - logits_prev_mem = self.model.forward(memorable_data_prev) - - num_classes = logits_prev_mem.shape[-1] - if num_classes == 1: - preds_prev_mem = torch.sigmoid(logits_prev_mem) - else: - preds_prev_mem = torch.softmax(logits_prev_mem, dim=-1) - self.preds_prev_mem_prev_model = preds_prev_mem.detach() - - # Calculate kernel = J \Sigma J^T for all memory points, and store via cholesky decomposition - intermediate_outputs = [] - for module in self.train_modules: - intermediate_outputs.append(module.output) - for class_id in range(num_classes): - loss_for_class = preds_prev_mem[:, class_id].sum() - retain_graph = ( - True if class_id < num_classes - 1 else None - ) # only clean up the graph after the last class - grad = self.calculate_grad( - loss_for_class, - intermediate_outputs, - self.train_modules, - retain_graph=retain_graph, - ) - kernel = ( - torch.einsum("ij,j,pj->ip", grad, covariance, grad) - + torch.eye(grad.shape[0], dtype=grad.dtype, device=grad.device) * eps - ) - self.kernel_inv_prev_mem_prev_model.append( - torch.cholesky_inverse(torch.cholesky(kernel)) - ) - - @classmethod - def set_train_modules(cls, module, train_modules): - """ - For calculating Jacobians in PyTorch - """ - if len(list(module.children())) == 0: - if len(list(module.parameters())) != 0: - train_modules.append(module) - module.register_forward_hook(update_input) - else: - for child in list(module.children()): - cls.set_train_modules(child, train_modules) - - @classmethod - def calculate_grad( - cls, loss, intermediate_outputs, train_modules, retain_graph=None - ): - """ - Calculate the gradient (part of calculating Jacobian) of the parameters lc wrt loss - """ - linear_grad = torch.autograd.grad( - loss, intermediate_outputs, retain_graph=retain_graph - ) - grad = [] - for i, module in enumerate(train_modules): - g = linear_grad[i] - a = module.input.clone().detach() - m = a.shape[0] - - if isinstance(module, nn.Linear): - grad.append(torch.einsum("ij,ik->ijk", g, a)) - if module.bias is not None: - grad.append(g) - - if isinstance(module, nn.Conv2d): - a = F.unfold( - a, - kernel_size=module.kernel_size, - dilation=module.dilation, - padding=module.padding, - stride=module.stride, - ) - _, k, hw = a.shape - _, c, _, _ = g.shape - g = g.view(m, c, -1) - 
grad.append(torch.einsum("ijl,ikl->ijk", g, a)) - if module.bias is not None: - a = torch.ones((m, 1, hw), device=a.device) - grad.append(torch.einsum("ijl,ikl->ijk", g, a)) - - if isinstance(module, nn.BatchNorm1d): - grad.append(torch.mul(g, a)) - if module.bias is not None: - grad.append(g) - - if isinstance(module, nn.BatchNorm2d): - grad.append(torch.einsum("ijkl->ij", torch.mul(g, a))) - if module.bias is not None: - grad.append(torch.einsum("ijkl->ij", g)) - - grad_m = parameters_to_matrix(grad) - return grad_m.detach() - - @classmethod - def calculate_jacobian(cls, output, intermediate_outputs, train_modules): - """ - Calculate the Jacobian matrix - """ - if output.dim() > 2: - raise ValueError("the dimension of output must be smaller than 3.") - else: # output.dim() == 2: - num_classes = output.shape[1] - grad = [] - for i in range(num_classes): - retain_graph = None if i == num_classes - 1 else True - loss = output[:, i].sum() - g = cls.calculate_grad( - loss, - intermediate_outputs, - train_modules=train_modules, - retain_graph=retain_graph, - ) - grad.append(g) - result = torch.zeros( - (grad[0].shape[0], grad[0].shape[1], num_classes), - dtype=grad[0].dtype, - device=grad[0].device, - ) - for i in range(num_classes): - result[:, :, i] = grad[i] - return result - - @classmethod - def compute_covariance(cls, data, model): - """ - After training on a new task, update the coviarance matrix estimate - """ - train_modules = [] - cls.set_train_modules(model, train_modules) - - logits = model.forward(data) - - intermediate_outputs = [] - for module in train_modules: - intermediate_outputs.append(module.output) - - jacobian = cls.calculate_jacobian(logits, intermediate_outputs, train_modules) - if logits.shape[-1] == 1: - hessian = logistic_hessian(logits).detach() - hessian = hessian[:, :, None] - else: - hessian = full_softmax_hessian(logits).detach() - return torch.einsum("ijd,idp,ijp->j", jacobian, hessian, jacobian) - - def post_backward(self, model): - parameters = self.model.parameters() - grad = parameter_grads_to_vector(parameters).detach() - grad *= 1 / self.alpha - - grad_func_reg = torch.zeros_like( - grad - ) # The gradient corresponding to memorable points - # compute predictions of memorable points (from previous task) - with eval_state(self.model): - memorable_data_prev = self.memorable_points_prev.to(self.device) - self.optimizer.zero_grad() - logits_prev_mem = self.model.forward(memorable_data_prev) - - num_classes = logits_prev_mem.shape[-1] - if num_classes == 1: - preds_prev_mem = torch.sigmoid(logits_prev_mem) - else: - preds_prev_mem = torch.softmax(logits_prev_mem, dim=-1) - - # collect all intermediate outputs: - intermediate_outputs = [] - for module in self.train_modules: - intermediate_outputs.append(module.output) - - # compute function loss for each output class: - for class_id in range(num_classes): - # \Lambda * Jacobian - loss_for_class = preds_prev_mem[:, class_id].sum() - retain_graph = ( - True if class_id < num_classes - 1 else None - ) # only clean up the graph after the last class - jacobian_t = self.calculate_grad( - loss_for_class, - intermediate_outputs, - self.train_modules, - retain_graph=retain_graph, - ) - - # m_t - m_{t-1} - delta_preds = ( - preds_prev_mem[:, class_id].detach() - - self.preds_prev_mem_prev_model[:, class_id] - ) - - # K_{t-1}^{-1} - kernel_inv_prev = self.kernel_inv_prev_mem_prev_model[class_id] - - # Uncomment the following line for L2 variants of algorithms - # kernel_inv_t = torch.eye(kernel_inv_t.shape[0], 
device=kernel_inv_t.device) - - # Calculate K_{t-1}^{-1} (m_t - m_{t-1}) - kinvf_t = torch.squeeze( - torch.matmul(kernel_inv_prev, delta_preds[:, None]), dim=-1 - ) - - grad_func_reg += torch.einsum("ij,i->j", jacobian_t, kinvf_t) - - grad += grad_func_reg - - # Do gradient norm clipping - if self.grad_clip_norm is not None: - grad_norm = torch.norm(grad) - grad_norm = ( - 1.0 - if grad_norm < self.grad_clip_norm - else grad_norm / self.grad_clip_norm - ) - grad /= grad_norm - - vector_to_parameter_grads(grad, parameters) diff --git a/bias_transfer/trainer/main_loop_modules/function_regularization/fromp_utils.py b/bias_transfer/trainer/main_loop_modules/function_regularization/fromp_utils.py deleted file mode 100644 index 89238d0..0000000 --- a/bias_transfer/trainer/main_loop_modules/function_regularization/fromp_utils.py +++ /dev/null @@ -1,109 +0,0 @@ -import torch -from torch.nn import functional as F - - -def update_input(self, input, output): - """ - Used to register forward hook - Args: - self: - input: - output: - - Returns: - - """ - self.input = input[0].data - self.output = output - - -def logistic_hessian(f): - """ - We only calculate the diagonal elements of the hessian - """ - f = f[:, :] - pi = torch.sigmoid(f) - return pi * (1 - pi) - - -def softmax_hessian(f): - s = F.softmax(f, dim=-1) - return s - s * s - - -def full_softmax_hessian(f): - """ - Calculate the full softmax hessian - """ - s = F.softmax(f, dim=-1) - e = torch.eye(s.shape[-1], dtype=s.dtype, device=s.device) - return s[:, :, None] * e[None, :, :] - s[:, :, None] * s[:, None, :] - - -def _check_param_device(param, old_param_device): - if old_param_device is None: - old_param_device = param.get_device() if param.is_cuda else -1 - else: - warn = False - if param.is_cuda: # check if in same gpu - warn = param.get_device() != old_param_device - else: # check if in cpu - warn = old_param_device != -1 - if warn: - raise TypeError( - "found two parameters on different devices, " - "this is currently not supported." - ) - return old_param_device - - -def parameters_to_matrix(parameters): - param_device = None - mat = [] - for param in parameters: - param_device = _check_param_device(param, param_device) - m = param.shape[0] - mat.append(param.view(m, -1)) - return torch.cat(mat, dim=-1) - - -def parameter_grads_to_vector(parameters): - param_device = None - vec = [] - for param in parameters: - param_device = _check_param_device(param, param_device) - if param.grad is None: - raise ValueError("gradient not available") - vec.append(param.grad.data.view(-1)) - return torch.cat(vec, dim=-1) - - -def vector_to_parameter_grads(vec, parameters): - r"""Convert one vector to the parameters - - Arguments: - vec (Tensor): a single vector represents the parameters of a model. - parameters (Iterable[Tensor]): an iterator of Tensors that are the - parameters of a model. 
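For readers skimming the deleted FROMP module above, a minimal sketch of the kernel step it performs for the memorable points: K = J Σ Jᵀ + εI, followed by a Cholesky-based inverse. The function name and the `grads`/`sigma_diag` inputs are illustrative, not part of the repository; it only restates what `init_task` computes, using the maintained `torch.linalg` API.

```python
import torch


def memorable_kernel_inverse(grads, sigma_diag, eps=1e-5):
    """Build K = J Sigma J^T + eps*I from per-example gradients and invert it.

    `grads` (N x P) and `sigma_diag` (P,) are assumed inputs; the deleted module
    obtains them from calculate_grad and the stored covariance, respectively.
    """
    kernel = torch.einsum("ij,j,pj->ip", grads, sigma_diag, grads)
    kernel = kernel + eps * torch.eye(
        kernel.shape[0], dtype=grads.dtype, device=grads.device
    )
    # torch.cholesky is deprecated; torch.linalg.cholesky is the current API.
    return torch.cholesky_inverse(torch.linalg.cholesky(kernel))


k_inv = memorable_kernel_inverse(torch.randn(8, 20), torch.rand(20))
```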
- """ - # Ensure vec of type Tensor - if not isinstance(vec, torch.Tensor): - raise TypeError( - "expected torch.Tensor, but got: {}".format(torch.typename(vec)) - ) - # Flag for the device where the parameter is located - param_device = None - - # Pointer for slicing the vector for each parameter - pointer = 0 - for param in parameters: - # Ensure the parameters are located in the same device - param_device = _check_param_device(param, param_device) - - # The length of the parameter - num_param = param.numel() - # Slice the vector, reshape it, and replace the old data of the parameter - param.grad = vec[pointer : pointer + num_param].view_as(param).grad - - # Increment the pointer - pointer += num_param diff --git a/bias_transfer/trainer/main_loop_modules/lottery_ticket_pruning.py b/bias_transfer/trainer/main_loop_modules/lottery_ticket_pruning.py deleted file mode 100644 index b8ec4c2..0000000 --- a/bias_transfer/trainer/main_loop_modules/lottery_ticket_pruning.py +++ /dev/null @@ -1,172 +0,0 @@ -import collections - -import numpy as np -import torch -import copy - -from torch import optim - -from bias_transfer.models.utils import weight_reset -from .main_loop_module import MainLoopModule - -EPS = 1e-6 - - -class LotteryTicketPruning(MainLoopModule): - """ - Based on the implementation from https://github.com/rahulvigneswaran/Lottery-Ticket-Hypothesis-in-Pytorch - (therefore indirectly from https://github.com/ktkth5/lottery-ticket-hyopothesis) - """ - - def __init__(self, trainer): - super().__init__(trainer) - if self.config.lottery_ticket.get("pruning", True): - n_epochs = self.config.max_iter - n_rounds = self.config.lottery_ticket.get("rounds", 1) - percent_to_prune = self.config.lottery_ticket.get("percent_to_prune", 80) - self.percent_per_round = ( - 1 - (1 - percent_to_prune / 100) ** (1 / n_rounds) - ) * 100 - self.reset_epochs = [ - r * self.config.lottery_ticket.get("round_length", 100) - for r in range(1, n_rounds + 1) - ] - print("Percent to prune per round:", self.percent_per_round, flush=True) - print("Reset before epochs:", list(self.reset_epochs), flush=True) - - # create initial (empty mask): - self.mask = self.make_empty_mask(self.trainer.model) - - # save initial state_dict to reset to this point later: - if not self.config.lottery_ticket.get("reinit"): - self.initial_state_dict = copy.deepcopy(self.trainer.model.state_dict()) - self.initial_optim_state_dict = None - self.initial_scheduler_state_dict = None - self.initial_w_scheduler_state_dict = None - - def pre_epoch( - self, model, mode, **options - ): - super().pre_epoch(model, mode, **options) - optimizer = self.trainer.optimizer - lr_scheduler = self.trainer.lr_scheduler - if self.config.lottery_ticket.get("pruning", True): - if not self.initial_optim_state_dict and optimizer is not None: - self.initial_optim_state_dict = copy.deepcopy(optimizer.state_dict()) - if not self.initial_scheduler_state_dict and lr_scheduler is not None: - self.initial_scheduler_state_dict = copy.deepcopy( - lr_scheduler.state_dict() - ) - if ( - hasattr(lr_scheduler, "warmup_scheduler") - and lr_scheduler.warmup_scheduler - ): # for warmup - self.initial_w_scheduler_state_dict = copy.deepcopy( - lr_scheduler.warmup_scheduler.state_dict() - ) - if self.tracker.epoch in self.reset_epochs and self.train_mode: - # Prune the network, i.e. 
update the mask - self.prune_by_percentile(model, self.percent_per_round) - print("Reset init in Epoch ", self.epoch, flush=True) - self.reset_initialization( - model, self.config.lottery_ticket.get("reinit") - ) - # Reset lr and scheduler: - if ( - hasattr(lr_scheduler, "warmup_scheduler") - and lr_scheduler.warmup_scheduler - ): # for warmup - lr_scheduler.warmup_scheduler.load_state_dict( - copy.deepcopy(self.initial_w_scheduler_state_dict) - ) - lr_scheduler.warmup_scheduler.last_step = -1 - optimizer.load_state_dict(copy.deepcopy(self.initial_optim_state_dict)) - optimizer._step_count = 0 - lr_scheduler.load_state_dict( - copy.deepcopy(self.initial_scheduler_state_dict) - ) - lr_scheduler._step_count = 0 - lr_scheduler.last_epoch = 0 - - def post_backward(self, model): - # Freezing Pruned weights by making their gradients Zero - for name, p in model.named_parameters(): - if "weight" in name and self.config.readout_name not in name: - tensor = torch.abs(p.data) - grad_tensor = p.grad.data - p.grad.data = torch.where( - tensor < EPS, torch.zeros_like(grad_tensor), grad_tensor - ) - - def prune_by_percentile(self, model, percent): - # Calculate percentile value - if self.config.lottery_ticket.get("global_pruning"): - alive_tensors = [] - step = 0 - for name, param in model.named_parameters(): - if ( - "weight" in name and self.config.readout_name not in name - ): # We do not prune bias term - alive_tensors.append( - param.data[torch.nonzero(self.mask[step], as_tuple=True)] - ) # flattened array of nonzero values - step += 1 - alive = torch.cat(alive_tensors) - percentile_value = np.percentile(torch.abs(alive).cpu().numpy(), percent) - - step = 0 - for name, param in model.named_parameters(): - if ( - "weight" in name and self.config.readout_name not in name - ): # We do not prune bias term - if not self.config.lottery_ticket.get("global_pruning"): - # print(nonzero) - alive = param.data[ - torch.nonzero(self.mask[step], as_tuple=True) - ] # flattened array of nonzero values - abs_alive = torch.abs(alive).cpu().numpy() - percentile_value = np.percentile(abs_alive, percent) - - # Convert Tensors to numpy and calculate - new_mask = torch.where( - torch.abs(param.data) - < torch.tensor(percentile_value, device=param.data.device), - torch.zeros_like(self.mask[step]), - self.mask[step], - ) - - # Apply new weight and mask - param.data = param.data * new_mask - self.mask[step] = new_mask - step += 1 - - def make_empty_mask(self, model): - """ - Function to make an empty mask of the same size as the model - :param model: - :return: mask - """ - step = 0 - for name, param in model.named_parameters(): - if "weight" in name and self.config.readout_name not in name: - step = step + 1 - mask = [None] * step - step = 0 - for name, param in model.named_parameters(): - if "weight" in name and self.config.readout_name not in name: - tensor = param.data - mask[step] = torch.ones_like(tensor, device=tensor.device) - step = step + 1 - return mask - - def reset_initialization(self, model, reinit=False): - if reinit: - model.apply(weight_reset) # new random init - step = 0 - for name, param in model.named_parameters(): - init = param.data if reinit else self.initial_state_dict[name] - if "weight" in name and self.config.readout_name not in name: - param.data = self.mask[step] * init - step = step + 1 - elif "bias" in name or "weight" in name: - param.data = init diff --git a/bias_transfer/trainer/main_loop_modules/noise_adv_training.py b/bias_transfer/trainer/main_loop_modules/noise_adv_training.py deleted 
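A compact sketch of the magnitude-based pruning step that `prune_by_percentile` above performs per layer; `prune_by_magnitude` and its tensor arguments are illustrative names, and `torch.quantile` stands in for the `np.percentile` call used in the deleted code.

```python
import torch


def prune_by_magnitude(weight, mask, percent):
    """Zero out the smallest `percent` of the still-unpruned weights and update the mask."""
    alive = weight[mask.bool()].abs()
    threshold = torch.quantile(alive, percent / 100.0)
    new_mask = torch.where(weight.abs() < threshold, torch.zeros_like(mask), mask)
    return weight * new_mask, new_mask


w, m = torch.randn(64, 64), torch.ones(64, 64)
w, m = prune_by_magnitude(w, m, percent=20.0)  # prune 20% of the remaining weights
```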
file mode 100644 index e55da8c..0000000 --- a/bias_transfer/trainer/main_loop_modules/noise_adv_training.py +++ /dev/null @@ -1,61 +0,0 @@ -import numpy as np -import torch -from torch import nn -from functools import partial - -from neuralpredictors.training import LongCycler -from .main_loop_module import MainLoopModule - - -class NoiseAdvTraining(MainLoopModule): - def __init__(self, trainer): - super().__init__(trainer) - self.progress = 0.0 - if isinstance(self.train_loader, LongCycler): - train_loader = self.train_loader.loaders - self.step_size = 1 / ( - self.config.max_iter * len(self.train_loader["img_classification"]) - ) - if self.config.noise_adv_regression: - self.criterion = nn.MSELoss() - else: # config.noise_adv_classification - self.criterion = nn.BCELoss() - objectives = { - "Training": {"NoiseAdvTraining": {"loss": 0, "normalization": 0}}, - "Validation": {"NoiseAdvTraining": {"loss": 0, "normalization": 0}}, - "Test": {"NoiseAdvTraining": {"loss": 0, "normalization": 0}}, - } - self.tracker.add_objectives(objectives) - - def pre_forward(self, model, inputs, task_key, shared_memory): - super().pre_forward(model, inputs, task_key, shared_memory) - noise_adv_lambda = ( - 2.0 / (1.0 + np.exp(-self.config.noise_adv_gamma * self.progress)) - 1 - ) - if self.train_mode: - self.progress += self.step_size - return partial(model, noise_lambda=noise_adv_lambda), inputs - - def post_forward(self, outputs, loss, targets, **shared_memory): - if not self.options.get("noise_adv",True): - return outputs, loss, targets - applied_std = shared_memory["applied_std"] - num_inputs = (applied_std != 0).sum().item() - extra_outputs = outputs[0] - if applied_std is None: - applied_std = torch.zeros_like( - extra_outputs["noise_pred"], device=self.device - ) - if self.config.noise_adv_classification: - applied_std = ( - (applied_std > 0.0).type(torch.FloatTensor).to(device=self.device) - ) - noise_loss = self.criterion(extra_outputs["noise_pred"], applied_std) - self.tracker.log_objective( - noise_loss.item() * num_inputs, (self.mode, "NoiseAdvTraining", "loss") - ) - self.tracker.log_objective( - num_inputs, (self.mode, "NoiseAdvTraining", "normalization"), - ) - loss += self.config.noise_adv_loss_factor * noise_loss - return outputs, loss, targets diff --git a/bias_transfer/trainer/main_loop_modules/parameter_regularization/__init__.py b/bias_transfer/trainer/main_loop_modules/parameter_regularization/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bias_transfer/trainer/main_loop_modules/parameter_regularization/mixup.py b/bias_transfer/trainer/main_loop_modules/parameter_regularization/mixup.py deleted file mode 100644 index 4386204..0000000 --- a/bias_transfer/trainer/main_loop_modules/parameter_regularization/mixup.py +++ /dev/null @@ -1,55 +0,0 @@ -import os -import torch -import numpy as np - -from bias_transfer.trainer.main_loop_modules.main_loop_module import MainLoopModule - - -class Mixup(MainLoopModule): - def __init__(self, trainer): - super().__init__(trainer) - - def mixup_data(self, x): - """ - Returns mixed inputs, and saves index and lambdas - Adapted from https://github.com/facebookresearch/mixup-cifar10/blob/master/train.py - """ - alpha = self.config.regularization.get("alpha",1.0) - if alpha > 0: - self.lam = np.random.beta(alpha, alpha) - else: - self.lam = 1 - - batch_size = x.size()[0] - self.index = torch.randperm(batch_size).to(self.device) - mixed_x = self.lam * x + (1 - self.lam) * x[self.index, :] - return mixed_x - - def pre_forward(self, 
model, inputs, task_key, shared_memory): - model, inputs = super().pre_forward( - model, inputs, task_key, shared_memory - ) - if self.train_mode: - inputs = self.mixup_data(inputs) - else: - self.lam = 1.0 - self.index = torch.arange(inputs.size()[0]) - return model, inputs - - def post_forward(self, outputs, loss, targets, **shared_memory): - if self.train_mode: - loss += (1 - self.lam) * self.trainer.criterion["img_classification"]( - outputs, targets[self.index] - ) - loss += self.lam * self.trainer.criterion["img_classification"]( - outputs, targets - ) - _, predicted = outputs.max(1) - correct = 100 * ( - self.lam * predicted.eq(targets).sum().item() - + (1 - self.lam) * predicted.eq(targets[self.index]).sum().item() - ) - self.tracker.log_objective( - correct, keys=(self.mode, self.task_key, "accuracy"), - ) - return outputs, loss, targets diff --git a/bias_transfer/trainer/main_loop_modules/parameter_regularization/param_distance.py b/bias_transfer/trainer/main_loop_modules/parameter_regularization/param_distance.py deleted file mode 100644 index 763b593..0000000 --- a/bias_transfer/trainer/main_loop_modules/parameter_regularization/param_distance.py +++ /dev/null @@ -1,53 +0,0 @@ -import copy -import os -from collections import OrderedDict - -import torch - -from bias_transfer.trainer.main_loop_modules.main_loop_module import MainLoopModule - - -class ParamDistance(MainLoopModule): - def __init__(self, trainer): - super().__init__(trainer) - self.sp_state_dict = OrderedDict() - state_dict = self.trainer.model.state_dict() - for k, v in state_dict.items(): - if isinstance(v, torch.Tensor): - self.sp_state_dict[k] = v.clone() - else: - self.sp_state_dict[k] = copy.deepcopy(v) - self.warned = False - self.alpha = self.config.regularization.get("alpha", 1.0) - self.ignore_layers = self.config.regularization.get("ignore_layers", ()) - objectives = { # TODO: make adaptable to other tasks! 
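The deleted Mixup module above combines two cross-entropy terms weighted by a Beta-sampled lambda; the following sketch condenses that logic into a single hypothetical `mixup_step` helper (all names are illustrative).

```python
import numpy as np
import torch
import torch.nn.functional as F


def mixup_step(model, x, y, alpha=1.0):
    """Mix inputs with a Beta(alpha, alpha) lambda and combine both label losses."""
    lam = float(np.random.beta(alpha, alpha)) if alpha > 0 else 1.0
    index = torch.randperm(x.size(0), device=x.device)
    mixed_x = lam * x + (1 - lam) * x[index]
    outputs = model(mixed_x)
    return lam * F.cross_entropy(outputs, y) + (1 - lam) * F.cross_entropy(outputs, y[index])
```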
- "Training": {"img_classification": {"P-Dist": 0}}, - "Validation": {"img_classification": {"P-Dist": 0}}, - "Test": {"img_classification": {"P-Dist": 0}}, - } - self.tracker.add_objectives(objectives, init_epoch=True) - - def post_forward(self, outputs, loss, targets, **shared_memory): - model = self.trainer.model - if self.train_mode: - reg_loss = torch.zeros(1, dtype=torch.float32, device=self.trainer.device) - for n, param in model.named_parameters(): - if n not in self.sp_state_dict: - if not self.warned: - print(f"skipping {n}") - self.warned = True - continue - for l in self.ignore_layers: - if l in n: - continue - n_ = n.replace(".", "__") - importance = getattr(model, f"{n_}_importance", 1.0) - distance = (importance * (param - self.sp_state_dict[n]) ** 2).sum() - reg_loss = reg_loss + distance - loss += self.alpha * reg_loss - self.tracker.log_objective( - loss.item(), (self.mode, self.task_key, "P-Dist") - ) - return outputs, loss, targets - else: - return outputs, loss, targets diff --git a/bias_transfer/trainer/main_loop_modules/parameter_regularization/vcl.py b/bias_transfer/trainer/main_loop_modules/parameter_regularization/vcl.py deleted file mode 100644 index 5002a85..0000000 --- a/bias_transfer/trainer/main_loop_modules/parameter_regularization/vcl.py +++ /dev/null @@ -1,55 +0,0 @@ -import copy -import os -from collections import OrderedDict -from functools import partial - -import torch - -from bias_transfer.trainer.main_loop_modules.main_loop_module import MainLoopModule - - -class VCL(MainLoopModule): - def __init__(self, trainer): - super().__init__(trainer) - self.eps = self.config.regularization.get("eps", 1e-8) - self.num_samples = self.config.regularization.get("num_samples", 10) - self.train_len = len( - self.trainer.data_loaders["train"]["img_classification"].dataset - ) - - def pre_forward(self, model, inputs, task_key, shared_memory): - super().pre_forward(model, inputs, task_key, shared_memory) - model_ = partial(model, num_samples=self.num_samples) - return model_, inputs - - def post_forward(self, outputs, loss, targets, **shared_memory): - loss += self._calculate_kl_term() / self.train_len - targets = targets.repeat(self.num_samples).view(-1) - return outputs, loss, targets - - def _calculate_kl_term(self): - """ - Calculates and returns the KL divergence of the new posterior and the previous - iteration's posterior. See equation L3, slide 14. 
- """ - model = self.trainer.model - # Prior - prior_means = model.get_parameters("prior_mean") - prior_log_vars = model.get_parameters("prior_log_var") - prior_vars = torch.exp(prior_log_vars) - - # Posterior - posterior_means = model.get_parameters("posterior_mean") - posterior_log_vars = model.get_parameters("posterior_log_var") - posterior_vars = torch.exp(posterior_log_vars) - - # Calculate KL for individual normal distributions over parameters - kl_elementwise = ( - posterior_vars / (prior_vars + self.eps) - + torch.pow(prior_means - posterior_means, 2) / (prior_vars + self.eps) - - 1 - + (prior_log_vars - posterior_log_vars) - ) - - # Sum KL over all parameters - return 0.5 * kl_elementwise.sum() diff --git a/bias_transfer/trainer/main_loop_modules/random_readout_reset.py b/bias_transfer/trainer/main_loop_modules/random_readout_reset.py deleted file mode 100644 index 14ebe6f..0000000 --- a/bias_transfer/trainer/main_loop_modules/random_readout_reset.py +++ /dev/null @@ -1,28 +0,0 @@ -import numpy as np -import torch -from torch import nn -from functools import partial - -from .main_loop_module import MainLoopModule - - -class RandomReadoutReset(MainLoopModule): - def __init__(self, trainer): - super().__init__(trainer) - self.batch_progress = 0 - self.epoch_progress = 0 - - def pre_epoch(self, model, mode, **options): - super(RandomReadoutReset, self).pre_epoch(model, mode, **options) - if self.train_mode and self.config.reset_linear_frequency.get("epoch"): - if self.epoch_progress % self.config.reset_linear_frequency["epoch"] == 0: - model.module.linear_readout.reset_parameters() - self.epoch_progress += 1 - - def pre_forward(self, model, inputs, task_key, shared_memory): - super().pre_forward(model, inputs, task_key, shared_memory) - if self.train_mode and self.config.reset_linear_frequency.get("batch"): - if self.batch_progress % self.config.reset_linear_frequency["batch"] == 0: - model.module.linear_readout.reset_parameters() - self.batch_progress += 1 - return model, inputs diff --git a/bias_transfer/trainer/main_loop_modules/representation_regularization/__init__.py b/bias_transfer/trainer/main_loop_modules/representation_regularization/__init__.py deleted file mode 100644 index cbed237..0000000 --- a/bias_transfer/trainer/main_loop_modules/representation_regularization/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from torch import nn -import torch - -from bias_transfer.trainer.main_loop_modules.main_loop_module import MainLoopModule - - -class RepresentationRegularization(MainLoopModule): - def __init__(self, trainer, name="RDL"): - super().__init__(trainer) - objectives = { # TODO: make adaptable to other tasks! 
- "Training": {"img_classification": {name: 0}}, - "Validation": {"img_classification": {name: 0}}, - "Test": {"img_classification": {name: 0}}, - } - self.tracker.add_objectives(objectives, init_epoch=True) - self.name = name - self.alpha_0 = self.config.regularization.get("alpha", 1.0) - self.alpha = 0.0 - - def pre_epoch(self, model, mode, **options): - super().pre_epoch(model, mode, **options) - if self.config.regularization.get("decay_alpha"): - self.alpha = self.alpha_0 * (1 - (self.epoch / self.config.max_iter)) - else: - self.alpha = self.alpha_0 - - def rep_distance(self, output, target): - raise NotImplementedError() - - def post_forward(self, outputs, loss, targets, **shared_memory): - extra_outputs = outputs[0] - if self.train_mode and ( - self.task_key == "transfer" or self.config.single_input_stream - ): - pred_loss = torch.zeros(1, device=self.device) - for key in targets.keys(): - if key == "class": - continue - pred_loss += self.rep_distance(extra_outputs[key], targets[key]) - loss += self.alpha * pred_loss - self.tracker.log_objective( - pred_loss.item(), (self.mode, "img_classification", self.name) - ) - return outputs, loss, targets.get("class", next(iter(targets.values()))) - else: - return outputs, loss, targets diff --git a/bias_transfer/trainer/main_loop_modules/representation_regularization/knowledge_distillation.py b/bias_transfer/trainer/main_loop_modules/representation_regularization/knowledge_distillation.py deleted file mode 100644 index 64fe603..0000000 --- a/bias_transfer/trainer/main_loop_modules/representation_regularization/knowledge_distillation.py +++ /dev/null @@ -1,19 +0,0 @@ -from torch import nn -import torch -import torch.nn.functional as F - - -from . import RepresentationRegularization - - -class KnowledgeDistillation(RepresentationRegularization): - def __init__(self, trainer): - super().__init__(trainer, name="KD") - self.criterion = nn.KLDivLoss(reduction="batchmean") - self.T = self.config.regularization.get("softmax_temp", 1.0) - - def rep_distance(self, output, target): - kd_loss = self.criterion( - F.log_softmax(output / self.T, dim=1), F.softmax(target / self.T, dim=1) - ) - return kd_loss * self.T * self.T diff --git a/bias_transfer/trainer/main_loop_modules/representation_regularization/rdl.py b/bias_transfer/trainer/main_loop_modules/representation_regularization/rdl.py deleted file mode 100644 index 8bd499a..0000000 --- a/bias_transfer/trainer/main_loop_modules/representation_regularization/rdl.py +++ /dev/null @@ -1,120 +0,0 @@ -from torch import nn -import torch - -from . 
import RepresentationRegularization -from ...utils import arctanh - - -class RDL(RepresentationRegularization): - @staticmethod - def centering(K): - n = K.shape[0] - unit = torch.ones([n, n], device=K.device) - I = torch.eye(n, device=K.device) - H = I - unit / n - - return torch.mm( - torch.mm(H, K), H - ) # HKH are the same with KH, KH is the first centering, H(KH) do the second time, results are the sme with one time centering - # return np.dot(H, K) # KH - - @staticmethod - def rbf(X, sigma=None): - GX = torch.dot(X, X.T) - KX = torch.diag(GX) - GX + (torch.diag(GX) - GX).T - if sigma is None: - mdist = torch.median(KX[KX != 0]) - sigma = math.sqrt(mdist) - KX *= -0.5 / (sigma * sigma) - KX = torch.exp(KX) - return KX - - @staticmethod - def kernel_HSIC(X, Y, sigma): - return torch.sum( - RDL.centering(RDL.rbf(X, sigma)) * RDL.centering(RDL.rbf(Y, sigma)) - ) - - @staticmethod - def linear_HSIC(X, Y): - L_X = torch.mm(X, X.T) - L_Y = torch.mm(Y, Y.T) - return torch.sum(RDL.centering(L_X) * RDL.centering(L_Y)) - - @staticmethod - def linear_CKA(X, Y): - hsic = RDL.linear_HSIC(X, Y) - var1 = torch.sqrt(RDL.linear_HSIC(X, X)) - var2 = torch.sqrt(RDL.linear_HSIC(Y, Y)) - - return hsic / (var1 * var2) - - @staticmethod - def kernel_CKA(X, Y, sigma=None): - hsic = RDL.kernel_HSIC(X, Y, sigma) - var1 = torch.sqrt(RDL.kernel_HSIC(X, X, sigma)) - var2 = torch.sqrt(RDL.kernel_HSIC(Y, Y, sigma)) - - return hsic / (var1 * var2) - - @staticmethod - def compute_mse_matrix(x, y=None): - """ - see: https://discuss.pytorch.org/t/efficient-distance-matrix-computation/9065 - Input: x is a Nxd matrix - y is an optional Mxd matirx - Output: dist is a NxM matrix where dist[i,j] is the square norm between x[i,:] and y[j,:] - if y is not given then use 'y=x'. - i.e. dist[i,j] = ||x[i,:]-y[j,:]||^2 - """ - x_norm = (x ** 2).sum(1).view(-1, 1) - if y is not None: - y_norm = (y ** 2).sum(1).view(1, -1) - else: - y = x - y_norm = x_norm.view(1, -1) - - dist = x_norm + y_norm - 2.0 * torch.mm(x, torch.transpose(y, 0, 1)) - return dist - - @staticmethod - def compute_rdm(x, dist_measure="corr"): - x_flat = x.flatten(1, -1) - centered = x_flat - x_flat.mean(dim=0).view( - 1, -1 - ) # centered by mean over images - if dist_measure == "corr": - result = (centered @ centered.transpose(0, 1)) / torch.ger( - torch.norm(centered, 2, dim=1), torch.norm(centered, 2, dim=1) - ) # see https://de.mathworks.com/help/images/ref/corr2.html - else: - result = RDL.compute_mse_matrix(centered) - return result - - @staticmethod - def rdm_comparison(x, y, criterion, dist_measure="corr", use_arctanh=False): - rdm_x = RDL.compute_rdm(x, dist_measure).flatten() - rdm_y = RDL.compute_rdm(y, dist_measure).flatten() - rdm_x = rdm_x.triu(diagonal=1) - rdm_y = rdm_y.triu(diagonal=1) - if use_arctanh: - rdm_x = arctanh(rdm_x) - rdm_y = arctanh(rdm_y) - return criterion(rdm_x, rdm_y) - - def __init__(self, trainer): - super().__init__(trainer, name="RDL") - self.criterion = nn.MSELoss() - self.dist_measure = self.config.regularization.get("dist_measure") - - def rep_distance(self, output, target): - if self.dist_measure == "CKA": - return RDL.linear_CKA(output, target) - else: - return RDL.rdm_comparison( - output, - target, - self.criterion, - self.dist_measure, - self.config.regularization.get("use_arctanh"), - ) diff --git a/bias_transfer/trainer/main_loop_modules/synaptic_intelligence.py b/bias_transfer/trainer/main_loop_modules/synaptic_intelligence.py deleted file mode 100644 index 3f4010e..0000000 --- 
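The deleted RDL module's `linear_CKA` boils down to centred Gram matrices and an HSIC normalisation; a compact standalone sketch with illustrative names:

```python
import torch


def linear_cka(x, y):
    """Centred linear CKA between two (N x D) representation matrices."""

    def centred_gram(z):
        gram = z @ z.t()
        n = gram.shape[0]
        h = torch.eye(n, device=z.device) - torch.ones(n, n, device=z.device) / n
        return h @ gram @ h

    gx, gy = centred_gram(x), centred_gram(y)
    hsic = (gx * gy).sum()
    return hsic / (gx.pow(2).sum().sqrt() * gy.pow(2).sum().sqrt())


similarity = linear_cka(torch.randn(32, 128), torch.randn(32, 64))
```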
a/bias_transfer/trainer/main_loop_modules/synaptic_intelligence.py +++ /dev/null @@ -1,48 +0,0 @@ -from .main_loop_module import MainLoopModule - - -class SynapticIntelligence(MainLoopModule): - """ - Implementation adapted from https://github.com/GMvandeVen/continual-learning/blob/master/continual_learner.py - """ - - def __init__(self, trainer): - super().__init__(trainer) - # Register starting param-values - model = trainer.model - for n, p in model.named_parameters(): - if p.requires_grad: - n = n.replace(".", "__") - model.register_buffer(f"{n}_SI_prev_task", p.data.clone()) - # Prepare to store running importance estimates and param-values before update ("Synaptic Intelligence") - self.params = {n: p for n, p in model.named_parameters() if p.requires_grad} - self.w = {} - self.old_params = {} - for n, p in model.named_parameters(): - if p.requires_grad: - n = n.replace(".", "__") - self.w[n] = p.data.clone().zero_() - self.old_params[n] = p.data.clone() - - def pre_forward(self, model, inputs, task_key, shared_memory): - super().pre_forward(model, inputs, task_key, shared_memory) - # Save current parameters - for n, p in self.params.items(): - n = n.replace(".", "__") - self.old_params[n] = p.clone().detach() - return model, inputs - - def post_optimizer(self, model): - # Accumulate the w - for n, p in self.params.items(): - n = n.replace(".", "__") - delta = p.detach() - self.old_params[n] - if ( - p.grad is not None - ): # In multi-head network, some head could have no grad (lazy) since no loss go through it. - self.w[n] -= p.grad * delta # w[n] is >=0 - - def post_epoch(self, model): - # Store to be used in final steps - for n, w in self.w.items(): - model.register_buffer(f"{n}_SI_omega", w) diff --git a/bias_transfer/trainer/neural_trainer.py b/bias_transfer/trainer/neural_trainer.py deleted file mode 100644 index d5682f2..0000000 --- a/bias_transfer/trainer/neural_trainer.py +++ /dev/null @@ -1,196 +0,0 @@ -from functools import partial - -import numpy as np -from torch import nn - -from bias_transfer.trainer.trainer import Trainer -from bias_transfer.trainer.utils import NBLossWrapper, get_subdict -from neuralpredictors import measures as mlmeasures -from nnvision.utility import measures -from nnvision.utility.measures import get_poisson_loss - - -def trainer(model, dataloaders, seed, uid, cb, eval_only=False, **kwargs): - t = NeuralTrainer(dataloaders, model, seed, uid, **kwargs) - return t.train(cb) - - -class NeuralTrainer(Trainer): - def get_tracker(self): - if self.config.track_training: - tracker_dict = dict( - correlation=partial( - get_correlations(), - self.model, - self.dataloaders["validation"], - device=self.device, - per_neuron=False, - ), - poisson_loss=partial( - get_poisson_loss(), - self.model, - self.dataloaders["validation"], - device=self.device, - per_neuron=False, - avg=False, - ), - ) - if hasattr(self.model, "tracked_values"): - tracker_dict.update(self.model.tracked_values) - tracker = MultipleObjectiveTracker(**tracker_dict) - else: - tracker = None - return tracker - - def get_training_controls(self): - self.criterion, self.stop_closure = {}, {} - for k in self.val_keys: - if "img_classification" not in k: - pass - if self.config.loss_weighing: - self.criterion[k] = NBLossWrapper().to(self.device) - else: - self.criterion[k] = getattr( - mlmeasures, self.config.loss_functions[k] - )(avg=self.config.avg_loss) - self.stop_closure[k] = {} - self.stop_closure[k]["eval"] = partial( - getattr(measures, "get_correlations"), - 
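`SynapticIntelligence.post_optimizer` above accumulates the path-integral importance `w` after each optimiser step; a sketch of that update with hypothetical argument names (`w`, `params`, `old_params` are dictionaries keyed by parameter name):

```python
import torch


def accumulate_si_importance(w, params, old_params):
    """After an optimiser step, add -grad * (p_new - p_old) to the running importance."""
    with torch.no_grad():
        for name, param in params.items():
            if param.grad is not None:
                w[name] -= param.grad * (param - old_params[name])
```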
dataloaders=dataloaders["validation"][k], - device=self.device, - per_neuron=False, - avg=True, - ) - self.stop_closure[k]["loss"] = partial( - get_poisson_loss, - dataloaders=dataloaders["validation"][k], - device=self.device, - per_neuron=False, - avg=False, - ) - - params = list(self.model.parameters()) - if self.config.loss_weighing: - for _, loss_object in self.criterion.items(): - params += list(loss_object.parameters()) - self.optimizer = getattr(optim, self.config.optimizer)( - params, **self.config.optimizer_options - ) - - def compute_loss( - self, - average_loss, - correct, - data_key, - loss, - outputs, - targets, - task_dict, - total, - total_loss, - total_loss_weight, - ): - if "img_classification" not in data_key: - loss += neural_full_objective( - self.model, - outputs, - data_loader, - self.criterion["neural"], - self.scale_loss, - data_key, - inputs, - targets, - ) - total["neural"] += get_correlations( - self.model, - batch_dict, - device=self.device, - as_dict=False, - per_neuron=False, - ) - task_dict["neural"]["eval"] = average_loss(total["neural"]) - total_loss["neural"] += loss.item() - task_dict["neural"]["epoch_loss"] = average_loss(total_loss["neural"]) - if self.config.loss_weighing: - total_loss_weight["neural"] += np.exp( - self.criterion["neural"].log_w.item() - ) - task_dict["neural"]["loss_weight"] = average_loss( - total_loss_weight["neural"] - ) - return loss - - def test_neural_model(model, data_loader, device, epoch, eval_type="Validation"): - loss = get_poisson_loss( - model, data_loader, device, as_dict=False, per_neuron=False - ) - eval = get_correlations( - model, data_loader, device=device, as_dict=False, per_neuron=False - ) - results = {"neural": {"eval": eval, "loss": loss}} - print( - "Neural {} Epoch {}: eval={}, loss={}".format( - eval_type, epoch, results["neural"]["eval"], results["neural"]["loss"] - ) - ) - return results - - def test_final_model( - self, - best_epoch, - best_eval, - config, - criterion, - dataloaders, - device, - epoch, - model, - seed, - test_n_iterations, - val_keys, - val_n_iterations, - ): - # test the final model with noise on the dev-set - # test the final model on the test set - test_results_dict, dev_final_results_dict = {}, {} - for k in self.val_keys: - if "img_classification" not in k: - dev_final_results = test_neural_model( - model, - data_loader=dataloaders["validation"][k], - device=device, - epoch=epoch, - eval_type="Validation", - ) - test_results = test_neural_model( - model, - data_loader=dataloaders["test"][k], - device=device, - epoch=epoch, - eval_type="Test", - ) - dev_final_results_dict.update(dev_final_results) - test_results_dict.update(test_results) - final_results = { - "test_results": test_results_dict, - "dev_eval": best_eval, - "epoch": best_epoch, - "dev_final_results": dev_final_results_dict, - } - return final_results, test_results_dict - - -def neural_full_objective( - model, outputs, dataloader, criterion, scale_loss, data_key, inputs, targets -): - - loss = criterion(outputs, targets) - loss_scale = ( - np.sqrt(len(dataloader[data_key].dataset) / inputs.shape[0]) - if scale_loss - else 1.0 - ) - loss *= loss_scale - if scale_loss: - loss += model.regularizer(data_key) - return loss diff --git a/bias_transfer/trainer/regression_trainer.py b/bias_transfer/trainer/regression_trainer.py deleted file mode 100644 index 82e2daa..0000000 --- a/bias_transfer/trainer/regression_trainer.py +++ /dev/null @@ -1,66 +0,0 @@ -from bias_transfer.trainer.utils.checkpointing import RemoteCheckpointing 
-from bias_transfer.trainer.img_classification_trainer import ImgClassificationTrainer -from bias_transfer.trainer.utils import get_subdict, arctanh -from neuralpredictors.tracking import AdvancedMultipleObjectiveTracker - - -def trainer(model, dataloaders, seed, uid, cb, eval_only=False, **kwargs): - t = RegressionTrainer(dataloaders, model, seed, uid, cb, **kwargs) - return t.train() - - -class RegressionTrainer(ImgClassificationTrainer): - checkpointing_cls = RemoteCheckpointing - - @property - def tracker(self): - try: - return self._tracker - except AttributeError: - objectives = { - "LR": 0, - "Training": {"regression": {"loss": 0, "normalization": 0}}, - "Validation": { - "regression": {"loss": 0, "normalization": 0}, - "patience": 0, - }, - } - self._tracker = AdvancedMultipleObjectiveTracker( - main_objective=("regression", "loss"), **objectives - ) - return self._tracker - - def compute_loss( - self, mode, task_key, loss, outputs, targets, - ): - reg_loss = self.criterion["regression"](outputs.reshape((-1,)), targets) - if self.config.scale_loss_with_arctanh: - reg_loss = arctanh(reg_loss) - - loss += reg_loss - _, predicted = outputs.max(1) - batch_size = targets.size(0) - self.tracker.log_objective( - batch_size, keys=(mode, task_key, "normalization"), - ) - self.tracker.log_objective( - loss.item() * batch_size, keys=(mode, task_key, "loss"), - ) - return loss - - def test_final_model(self, epoch, bn_train=""): - if not bn_train and self.config.eval_with_bn_train: - self.test_final_model(epoch, bn_train=" BN=Train") - # test the final model on the test set - for k in self.task_keys: - objectives = {"Test" + bn_train: {k: {"loss": 0, "normalization": 0,}}} - self.tracker.add_objectives(objectives, init_epoch=True) - test_result = self.main_loop( - epoch=epoch, - data_loader=get_subdict(self.data_loaders["test"], [k]), - mode="Test" + bn_train, - cycler_args={}, - cycler="LongCycler", - module_options={}, - ) - return test_result diff --git a/bias_transfer/trainer/transfer/__init__.py b/bias_transfer/trainer/transfer/__init__.py deleted file mode 100644 index 9ce68a0..0000000 --- a/bias_transfer/trainer/transfer/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .data_generator import * \ No newline at end of file diff --git a/bias_transfer/trainer/transfer/coreset_extraction.py b/bias_transfer/trainer/transfer/coreset_extraction.py deleted file mode 100644 index 13e93e8..0000000 --- a/bias_transfer/trainer/transfer/coreset_extraction.py +++ /dev/null @@ -1,233 +0,0 @@ -from copy import copy - -import torch -import numpy as np -from tqdm import tqdm - -from bias_transfer.dataset.dataset_classes.npy_dataset import NpyDataset -from bias_transfer.trainer.main_loop_modules.function_regularization.fromp_utils import ( - logistic_hessian, - softmax_hessian, -) - - -def extract_coreset( - data_loader, - method, - size, - model, - seed, - device, - initial_method="", - remove_from_data=True, - save_trainset=False, - **kwargs -): - print(f"Extracting Coreset using {method}") - collected_inputs = [] - collected_labels = [] - for src, trg in data_loader: - collected_inputs.append(src) - collected_labels.append(trg) - inputs = torch.cat(collected_inputs).numpy() - labels = torch.cat(collected_labels).numpy() - indices = list(range(len(inputs))) - if "k-center" in (method, initial_method): - coreset_idx, remain_idx = k_center(inputs, indices, size) - elif "fromp" in (method, initial_method): - coreset_idx, remain_idx = select_memorable_points( - inputs, labels, model, size, device, **kwargs - ) - 
elif "random_class_balanced" in (method, initial_method): - coreset_idx, remain_idx = random_class_balanced(labels, indices, seed, size) - else: # "random": - coreset_idx, remain_idx = random(indices, seed, size) - if method == "frcl": # needs an initial extraction run - coreset_idx, remain_idx = find_best_inducing_points( - inputs, model, size, coreset_idx, remain_idx, device, **kwargs - ) - if save_trainset: - if not remove_from_data: - remain_idx = list(range(len(inputs))) - return { - "source": inputs[remain_idx], - "source_cs": inputs[coreset_idx], - "target": labels[remain_idx], - "target_cs": labels[coreset_idx], - } - else: - return { - "source_cs": inputs[coreset_idx], - "target_cs": labels[coreset_idx], - } - - -def random(indices, seed, size): - np.random.seed(seed) - np.random.shuffle(indices) - coreset_idx, remain_idx = indices[:size], indices[size:] - return coreset_idx, remain_idx - - -def random_class_balanced(labels, indices, seed, size): - np.random.seed(seed) - np.random.shuffle(indices) - num_classes = max(labels) + 1 - size_per_class = size / num_classes - labels_selected = {l: 0 for l in range(num_classes)} - coreset_idx = [] - remain_idx = [] - for i, idx in enumerate(indices): - if len(coreset_idx) >= size: - remain_idx += indices[i:] - break - if labels_selected[labels[idx]] >= size_per_class: - remain_idx.append(idx) - continue - labels_selected[labels[idx]] += 1 - coreset_idx.append(labels[idx]) - return coreset_idx, remain_idx - - -def k_center(dataset, indices, size): - def update_distance(dists, x_train, current_id): - for i in range(x_train.shape[0]): - current_dist = np.linalg.norm(x_train[i, :] - x_train[current_id, :]) - dists[i] = np.minimum(current_dist, dists[i]) - return dists - - dists = np.full(dataset.shape[0], np.inf) - current_id = 0 - coreset_idx = [] - remain_idx = indices - for _ in range(size): - dists = update_distance(dists, dataset, current_id) - coreset_idx.append(current_id) - remain_idx.remove(current_id) - current_id = np.argmax(dists) - return coreset_idx, remain_idx - - -def calculate_induce_quality_statistic(idx, dataset, model, device): - """ - Calculates trace statistic of inducing quality - (up to multiplication by prior variance) - """ - statistic = 0 - - full_dataset_loader = torch.utils.data.DataLoader( - NpyDataset(samples=dataset, targets=dataset), - batch_size=500, - shuffle=False, - ) - model.eval() - with torch.no_grad(): - phi_z = model.core_forward(torch.tensor(dataset[idx]).to(device)) - k_zz = phi_z @ phi_z.T - inv_k_zz = torch.inverse(k_zz + torch.eye(k_zz.shape[0]).to(device) * 1e-3) - for x_batch, _ in full_dataset_loader: - phi_x = model.core_forward(x_batch.to(device)) - k_xz = phi_x @ phi_z.T - k_xx = phi_x @ phi_x.T - statistic += torch.trace(k_xx - k_xz @ inv_k_zz @ k_xz.T).cpu() - return statistic - - -def find_best_inducing_points( - dataset, - model, - size, - coreset_idx, - remain_idx, - device, - max_iter=300, - early_stop_num_iter=80, - verbose=True, -): - - """Sequentially adds a new point instead of a random one in - the initial set of inducing points, if the value of the statstic - above lessens, and does not do anything otherwise. 
- - start_inducing_set: list of points to start from - - max_iter: maximum number of tries to add a point - """ - score = calculate_induce_quality_statistic(coreset_idx, dataset, model, device) - new_point_counter = 0 - early_stop_counter = 0 - for i in range(max_iter): - add_point = np.random.randint(0, len(remain_idx)) - remove_point = np.random.randint(0, size) - coreset_idx_new = copy(coreset_idx) - coreset_idx_new[remove_point] = remain_idx[add_point] - score_new = calculate_induce_quality_statistic( - coreset_idx_new, dataset, model, device - ) - if score_new < score: - remain_idx[add_point] = coreset_idx[remove_point] - score, coreset_idx = score_new, coreset_idx_new - new_point_counter += 1 - early_stop_counter = 0 - else: - early_stop_counter += 1 - if verbose and i % 10 == 0: - print("Iteration {} out of {} is in progress".format(i, max_iter)) - print("Current best statistic is ", round(score.item(), 3)) - print("New points added ", new_point_counter, "\n") - if early_stop_counter == early_stop_num_iter: - print("Early stop activated!") - break - return coreset_idx, remain_idx - - -def select_memorable_points( - inputs, - labels, - model, - size, - device, - descending=True, -): - """ - Select memorable points ordered by their lambda values (descending=True picks most important points) - Adapted from - """ - batch_size = 500 - dataloader = torch.utils.data.DataLoader( - NpyDataset(samples=inputs, targets=labels), - batch_size=batch_size, - shuffle=False, - ) - num_classes = max(labels) + 1 - num_points_per_class = int(size / num_classes) - scores = {class_id: [] for class_id in range(num_classes)} - idx = {class_id: [] for class_id in range(num_classes)} - indices = torch.tensor(list(range(inputs.shape[0]))) - # collect scores - for i, (data, target) in tqdm(enumerate(dataloader)): - batch_start = i * batch_size - batch_end = min((i + 1) * batch_size, inputs.shape[0]) - data = data.to(device) - f = model.forward(data) - if f.shape[-1] > 1: - lamb = softmax_hessian(f) - lamb = torch.sum(lamb, dim=-1) - else: - lamb = logistic_hessian(f) - lamb = torch.squeeze(lamb, dim=-1) - lamb = lamb.detach().cpu() # hessian serves as a proxy for noise precision - for class_id in range(num_classes): - idx[class_id].append(indices[batch_start:batch_end][target == class_id]) - scores[class_id].append(lamb[target == class_id]) - - # sort by scores - coreset_idx = [] - remain_idx = [] - for class_id in range(num_classes): - idx[class_id] = torch.cat(idx[class_id], dim=0) - scores[class_id] = torch.cat(scores[class_id], dim=0) - _, indices = scores[class_id].sort(descending=descending) - - coreset_idx.append(idx[class_id][indices[:num_points_per_class]]) - remain_idx.append(idx[class_id][indices[num_points_per_class:]]) - - return torch.cat(coreset_idx), torch.cat(remain_idx) diff --git a/bias_transfer/trainer/transfer/data_generator.py b/bias_transfer/trainer/transfer/data_generator.py deleted file mode 100644 index f1dd430..0000000 --- a/bias_transfer/trainer/transfer/data_generator.py +++ /dev/null @@ -1,170 +0,0 @@ -from copy import copy - -from torch.utils.data.sampler import SubsetRandomSampler -from tqdm import tqdm -import torch -import numpy as np -import matplotlib.pyplot as plt - -from bias_transfer.dataset.dataset_classes.npy_dataset import NpyDataset -from bias_transfer.trainer.img_classification_trainer import ImgClassificationTrainer -from bias_transfer.trainer.main_loop_modules.fisher_estimation import FisherEstimation -from 
bias_transfer.trainer.main_loop_modules.function_regularization.fromp import FROMP -from bias_transfer.trainer.regression_trainer import RegressionTrainer -from bias_transfer.trainer.trainer import Trainer -from bias_transfer.trainer.transfer.coreset_extraction import extract_coreset - - -class DataGenerator(Trainer): - def __init__(self, dataloaders, model, seed, uid, cb, **kwargs): - super().__init__(dataloaders, model, seed, uid, cb, **kwargs) - self.main_task = list(self.task_keys)[0] - - def train(self): - self.tracker.start_epoch() - if hasattr(tqdm, "_instances"): - tqdm._instances.clear() - - if self.config.save_representation: - train = self.generate_rep_dataset(data="train") - elif self.config.extract_coreset: - save_in_model = self.config.extract_coreset.pop("save_in_model", False) - train = extract_coreset( - data_loader=self.data_loaders["train"][self.main_task], - model=self.model, - seed=self.seed, - device=self.device, - **self.config.extract_coreset, - ) - if f"{self.main_task}_cs" in self.data_loaders["train"]: # update coreset - cs = self.data_loaders["train"][f"{self.main_task}_cs"].dataset - train["source_cs"] = np.concatenate([train["source_cs"], cs.samples]) - train["target_cs"] = np.concatenate([train["target_cs"], cs.targets]) - if save_in_model: - self.model.coreset = torch.tensor(train["source_cs"]).to(self.device) - else: - train = {} - - if self.config.compute_fisher: - self.estimate_fisher(data="train") - elif self.config.compute_covariance: - train["covariance"] = self.compute_covariance( - data="train", - batch_size=self.config.compute_covariance.get("batch_size", 32), - ) - elif self.config.compute_si_omega: - self.compute_omega() - - if self.config.reset_for_new_task: - self.model.reset_for_new_task() - return 0.0, {}, self.model.state_dict(), train - - def generate_rep_dataset(self, data): - _, collected_outputs = self.main_loop( - data_loader=self.data_loaders[data], - epoch=0, - mode="Validation", - return_outputs=True, - ) - outputs = {} - for rep_name in collected_outputs[0].keys(): - outputs[rep_name] = torch.cat( - [batch_output[rep_name] for batch_output in collected_outputs] - ).numpy() - if self.config.save_input: - collected_inputs = [] - data_loader = next(iter(self.data_loaders[data].values())) - for src, _ in data_loader: - collected_inputs.append(src) - outputs["source"] = torch.cat(collected_inputs).numpy() - return outputs - - def estimate_fisher(self, data): - task_key = next(iter(self.data_loaders[data].keys())) - data_loader = self.data_loaders[data][task_key] - indices = list(range(len(data_loader.dataset))) - np.random.seed(self.seed) - np.random.shuffle(indices) - indices = indices[: self.config.compute_fisher.get("num_samples", 128)] - sampler = SubsetRandomSampler(indices) - data_loader = torch.utils.data.DataLoader( - data_loader.dataset, - batch_size=1, - sampler=sampler, - num_workers=data_loader.num_workers, - pin_memory=data_loader.pin_memory, - shuffle=False, - ) - objectives = { - "Generation": {task_key: {"loss": 0, "accuracy": 0, "normalization": 0}}, - } - self.tracker.add_objectives(objectives, init_epoch=True) - self.main_loop_modules.append(FisherEstimation(trainer=self)) - self.main_loop( - data_loader={task_key: data_loader}, - epoch=0, - mode="Generation", - return_outputs=False, - ) - - def compute_covariance(self, data, batch_size=32): - task_key = next(iter(self.data_loaders[data].keys())) - data_loader = self.data_loaders[data][task_key] - np.random.seed(self.seed) - data_loader = torch.utils.data.DataLoader( - 
data_loader.dataset, - batch_size=batch_size, - num_workers=data_loader.num_workers, - pin_memory=data_loader.pin_memory, - shuffle=False, - ) - self.model.eval() - covariance = 0 - # self.state['fisher'] = torch.zeros_like(self.state['mu']) - for data, label in tqdm(data_loader): - data = data.to(self.device) - self.optimizer.zero_grad() - covariance += FROMP.compute_covariance(data, self.model).cpu() - return covariance - - def compute_omega(self): - print("Compute Synaptic Intelligence Omega") - damping_factor = self.config.compute_si_omega.get("damping_factor", 0.0001) - # Loop over all parameters - for n, p in self.model.named_parameters(): - if p.requires_grad: - n = n.replace(".", "__") - - # Find/calculate new values for quadratic penalty on parameters - p_prev = getattr( - self.model, f"{n}_SI_prev_task" - ) # initial param values - omega = getattr(self.model, f"{n}_SI_omega") - p_current = p.detach().clone() - p_change = p_current - p_prev - omega_new = omega / (p_change ** 2 + damping_factor) - - # Store these new values in the model - self.model.register_buffer(f"{n}_importance", omega_new) - delattr(self.model, f"{n}_SI_omega") - delattr(self.model, f"{n}_SI_prev_task") - - -class TransferDataGeneratorClassificiation(ImgClassificationTrainer, DataGenerator): - pass - - -class TransferDataGeneratorRegression(RegressionTrainer, DataGenerator): - pass - - -def trainer(model, dataloaders, seed, uid, cb, eval_only=False, **kwargs): - t = TransferDataGeneratorClassificiation( - dataloaders, model, seed, uid, cb, **kwargs - ) - return t.train() - - -def regression_trainer(model, dataloaders, seed, uid, cb, eval_only=False, **kwargs): - t = TransferDataGeneratorRegression(dataloaders, model, seed, uid, cb, **kwargs) - return t.train() diff --git a/bias_transfer/analysis/__init__.py b/nntransfer/__init__.py similarity index 100% rename from bias_transfer/analysis/__init__.py rename to nntransfer/__init__.py diff --git a/bias_transfer/analysis/representation/__init__.py b/nntransfer/analysis/__init__.py similarity index 100% rename from bias_transfer/analysis/representation/__init__.py rename to nntransfer/analysis/__init__.py diff --git a/bias_transfer/analysis/plot.py b/nntransfer/analysis/plot.py similarity index 100% rename from bias_transfer/analysis/plot.py rename to nntransfer/analysis/plot.py diff --git a/bias_transfer/analysis/results/__init__.py b/nntransfer/analysis/results/__init__.py similarity index 100% rename from bias_transfer/analysis/results/__init__.py rename to nntransfer/analysis/results/__init__.py diff --git a/bias_transfer/analysis/results/base.py b/nntransfer/analysis/results/base.py similarity index 96% rename from bias_transfer/analysis/results/base.py rename to nntransfer/analysis/results/base.py index 9bbf049..7008f73 100644 --- a/bias_transfer/analysis/results/base.py +++ b/nntransfer/analysis/results/base.py @@ -2,8 +2,8 @@ import seaborn as sns import matplotlib.pyplot as plt -from bias_transfer.analysis.plot import plot -from bias_transfer.tables.transfer import TransferredTrainedModel +from nntransfer.analysis.plot import plot +from nntransfer.tables.transfer import TransferredTrainedModel from neuralpredictors.tracking import AdvancedMultipleObjectiveTracker as Tracker diff --git a/bias_transfer/analysis/results/noise_transfer.py b/nntransfer/analysis/results/noise_transfer.py similarity index 100% rename from bias_transfer/analysis/results/noise_transfer.py rename to nntransfer/analysis/results/noise_transfer.py diff --git 
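`compute_omega` above converts the accumulated Synaptic Intelligence statistic into per-parameter importances; the formula in isolation, with illustrative names:

```python
import torch


def si_omega(accumulated_w, p_current, p_prev, damping=1e-4):
    """Per-parameter importance: omega = w / ((p_current - p_prev)**2 + damping)."""
    return accumulated_w / ((p_current - p_prev) ** 2 + damping)


omega = si_omega(torch.rand(10), torch.randn(10), torch.randn(10))
```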
a/bias_transfer/analysis/train_path/__init__.py b/nntransfer/configs/__init__.py similarity index 100% rename from bias_transfer/analysis/train_path/__init__.py rename to nntransfer/configs/__init__.py diff --git a/bias_transfer/configs/base.py b/nntransfer/configs/base.py similarity index 100% rename from bias_transfer/configs/base.py rename to nntransfer/configs/base.py diff --git a/nntransfer/configs/dataset/__init__.py b/nntransfer/configs/dataset/__init__.py new file mode 100644 index 0000000..85c1058 --- /dev/null +++ b/nntransfer/configs/dataset/__init__.py @@ -0,0 +1,2 @@ +from .image import ImageDatasetConfig +from .base import DatasetConfig diff --git a/bias_transfer/configs/dataset/base.py b/nntransfer/configs/dataset/base.py similarity index 87% rename from bias_transfer/configs/dataset/base.py rename to nntransfer/configs/dataset/base.py index 3b4be58..5c78897 100644 --- a/bias_transfer/configs/dataset/base.py +++ b/nntransfer/configs/dataset/base.py @@ -1,4 +1,4 @@ -from bias_transfer.configs.base import BaseConfig +from nntransfer.configs.base import BaseConfig class DatasetConfig(BaseConfig): diff --git a/nntransfer/configs/dataset/image.py b/nntransfer/configs/dataset/image.py new file mode 100644 index 0000000..4b44820 --- /dev/null +++ b/nntransfer/configs/dataset/image.py @@ -0,0 +1,64 @@ +from typing import Dict, Tuple + +from nntransfer.configs.dataset.base import DatasetConfig +from nntransfer.tables.nnfabrik import Dataset + + +class ImageDatasetConfig(DatasetConfig): + config_name = "dataset" + table = Dataset() + fn = "bias_transfer.dataset.torchvision_dataset_loader" + + data_mean_defaults = { + "CIFAR100": ( + 0.5070751592371323, + 0.48654887331495095, + 0.4409178433670343, + ), + "CIFAR10": (0.49139968, 0.48215841, 0.44653091), + "SVHN": (0.4377, 0.4438, 0.4728), + "MNIST": (0.1307,), + } + data_std_defaults = { + "CIFAR100": ( + 0.2673342858792401, + 0.2564384629170883, + 0.27615047132568404, + ), + "CIFAR10": (0.24703223, 0.24348513, 0.26158784), + "SVHN": (0.1980, 0.2010, 0.1970), + "MNIST": (0.3081,), + } + + def __init__(self, **kwargs): + self.load_kwargs(**kwargs) + + self.dataset_cls: str = "CIFAR10" + self.apply_augmentation: bool = True + self.apply_normalization: bool = True + self.apply_grayscale: bool = False + self.apply_noise: Dict = {} + self.convert_to_rgb: bool = False + self.input_size: int = 32 + self.add_corrupted_test: bool = False + self.add_stylized_test: bool = False + self.use_c_test_as_val: bool = False + self.show_sample: bool = False + self.filter_classes: Tuple = () # (start,end) + self.data_dir: str = "./data/image_classification/torchvision/" + self.num_workers: int = 1 + dataset_id = ( + f"{self.dataset_sub_cls}_{self.bias}" if self.bias else self.dataset_cls + ) + dataset_id += "_bw" if self.apply_grayscale else "" + self.train_data_mean: Tuple[float] = self.data_mean_defaults[dataset_id] + self.train_data_std: Tuple[float] = self.data_std_defaults[dataset_id] + + super().__init__(**kwargs) + + @property + def filters(self): + filters = [] + if self.filter_classes: + filters.append("ClassesFilter") + return filters diff --git a/bias_transfer/configs/experiment.py b/nntransfer/configs/experiment.py similarity index 98% rename from bias_transfer/configs/experiment.py rename to nntransfer/configs/experiment.py index b8f59bf..67146b8 100644 --- a/bias_transfer/configs/experiment.py +++ b/nntransfer/configs/experiment.py @@ -1,7 +1,7 @@ from typing import Dict from .base import BaseConfig -from bias_transfer.tables.nnfabrik import * 
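The per-dataset mean/std defaults in `ImageDatasetConfig` above are typically consumed by normalisation transforms. A sketch of how they might be wired up for CIFAR10; the augmentation choice (random crop plus horizontal flip) is an assumption, since `get_transforms` is left abstract in the loader below:

```python
import torchvision.transforms as transforms

# CIFAR10 statistics copied from the config defaults above.
mean = (0.49139968, 0.48215841, 0.44653091)
std = (0.24703223, 0.24348513, 0.26158784)

transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])
```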
+from nntransfer.tables.nnfabrik import *
 
 
 class Experiment(BaseConfig):
diff --git a/bias_transfer/configs/__init__.py b/nntransfer/configs/model/__init__.py
similarity index 100%
rename from bias_transfer/configs/__init__.py
rename to nntransfer/configs/model/__init__.py
diff --git a/bias_transfer/configs/model/base.py b/nntransfer/configs/model/base.py
similarity index 75%
rename from bias_transfer/configs/model/base.py
rename to nntransfer/configs/model/base.py
index db8b33b..6e953ec 100644
--- a/bias_transfer/configs/model/base.py
+++ b/nntransfer/configs/model/base.py
@@ -1,7 +1,7 @@
 from typing import Dict
 
-from bias_transfer.configs.base import BaseConfig
-from bias_transfer.tables.nnfabrik import Model
+from nntransfer.configs.base import BaseConfig
+from nntransfer.tables.nnfabrik import Model
 
 
 class ModelConfig(BaseConfig):
diff --git a/nntransfer/configs/trainer/__init__.py b/nntransfer/configs/trainer/__init__.py
new file mode 100644
index 0000000..d6109b6
--- /dev/null
+++ b/nntransfer/configs/trainer/__init__.py
@@ -0,0 +1,2 @@
+from .base import TrainerConfig
+from . import mixins
\ No newline at end of file
diff --git a/bias_transfer/configs/trainer/base.py b/nntransfer/configs/trainer/base.py
similarity index 95%
rename from bias_transfer/configs/trainer/base.py
rename to nntransfer/configs/trainer/base.py
index 1bfc55d..6e2e76c 100644
--- a/bias_transfer/configs/trainer/base.py
+++ b/nntransfer/configs/trainer/base.py
@@ -1,7 +1,7 @@
 from typing import Dict, Tuple
 
-from bias_transfer.configs.base import BaseConfig
-from bias_transfer.tables.nnfabrik import *
+from nntransfer.configs.base import BaseConfig
+from nntransfer.tables.nnfabrik import *
 
 
 class TrainerConfig(BaseConfig):
diff --git a/nntransfer/configs/trainer/mixins/__init__.py b/nntransfer/configs/trainer/mixins/__init__.py
new file mode 100644
index 0000000..31a7673
--- /dev/null
+++ b/nntransfer/configs/trainer/mixins/__init__.py
@@ -0,0 +1,5 @@
+from .noise import (
+    RepresentationMatchingMixin,
+    RepresentationMonitorMixin,
+    NoiseAugmentationMixin,
+)
diff --git a/bias_transfer/configs/trainer/mixins/noise.py b/nntransfer/configs/trainer/mixins/noise.py
similarity index 79%
rename from bias_transfer/configs/trainer/mixins/noise.py
rename to nntransfer/configs/trainer/mixins/noise.py
index 95ac107..5ebe756 100644
--- a/bias_transfer/configs/trainer/mixins/noise.py
+++ b/nntransfer/configs/trainer/mixins/noise.py
@@ -1,6 +1,6 @@
 from typing import Dict
 
-from bias_transfer.configs.base import BaseConfig
+from nntransfer.configs.base import BaseConfig
 
 
 class NoiseAugmentationMixin(BaseConfig):
@@ -52,25 +52,6 @@     def conditional_assignment(self):
         super().conditional_assignment()
 
 
-class NoiseAdversarialMixin(BaseConfig):
-    def __init__(self, **kwargs):
-        self.load_kwargs(**kwargs)
-
-        self.noise_adv_classification: bool = False
-        self.noise_adv_regression: bool = False
-        self.noise_adv_loss_factor: float = 1.0
-        self.noise_adv_gamma: float = 10.0
-
-        super().__init__(**kwargs)
-
-    def conditional_assignment(self):
-        if (
-            self.noise_adv_classification or self.noise_adv_regression
-        ) and not "NoiseAdvTraining" in self.main_loop_modules:
-            self.main_loop_modules.append("NoiseAdvTraining")
-        super().conditional_assignment()
-
-
 class RepresentationMatchingMixin(BaseConfig):
     def __init__(self, **kwargs):
         self.load_kwargs(**kwargs)
diff --git a/bias_transfer/configs/transfer_experiment.py b/nntransfer/configs/transfer_experiment.py
similarity index 100%
rename from bias_transfer/configs/transfer_experiment.py
rename to nntransfer/configs/transfer_experiment.py
diff --git a/bias_transfer/dataset/dataset_classes/__init__.py b/nntransfer/dataset/__init__.py
similarity index 100%
rename from bias_transfer/dataset/dataset_classes/__init__.py
rename to nntransfer/dataset/__init__.py
diff --git a/bias_transfer/gp/__init__.py b/nntransfer/dataset/dataset_classes/__init__.py
similarity index 100%
rename from bias_transfer/gp/__init__.py
rename to nntransfer/dataset/dataset_classes/__init__.py
diff --git a/bias_transfer/dataset/dataset_classes/combined_dataset.py b/nntransfer/dataset/dataset_classes/combined_dataset.py
similarity index 100%
rename from bias_transfer/dataset/dataset_classes/combined_dataset.py
rename to nntransfer/dataset/dataset_classes/combined_dataset.py
diff --git a/bias_transfer/dataset/dataset_classes/npy_dataset.py b/nntransfer/dataset/dataset_classes/npy_dataset.py
similarity index 100%
rename from bias_transfer/dataset/dataset_classes/npy_dataset.py
rename to nntransfer/dataset/dataset_classes/npy_dataset.py
diff --git a/bias_transfer/dataset/dataset_classes/pkl_dataset.py b/nntransfer/dataset/dataset_classes/pkl_dataset.py
similarity index 100%
rename from bias_transfer/dataset/dataset_classes/pkl_dataset.py
rename to nntransfer/dataset/dataset_classes/pkl_dataset.py
diff --git a/bias_transfer/dataset/dataset_filters/__init__.py b/nntransfer/dataset/dataset_filters/__init__.py
similarity index 100%
rename from bias_transfer/dataset/dataset_filters/__init__.py
rename to nntransfer/dataset/dataset_filters/__init__.py
diff --git a/bias_transfer/dataset/dataset_filters/classes_filter.py b/nntransfer/dataset/dataset_filters/classes_filter.py
similarity index 100%
rename from bias_transfer/dataset/dataset_filters/classes_filter.py
rename to nntransfer/dataset/dataset_filters/classes_filter.py
diff --git a/bias_transfer/dataset/dataset_filters/dataset_filter.py b/nntransfer/dataset/dataset_filters/dataset_filter.py
similarity index 100%
rename from bias_transfer/dataset/dataset_filters/dataset_filter.py
rename to nntransfer/dataset/dataset_filters/dataset_filter.py
diff --git a/nntransfer/dataset/img_dataset_loader.py b/nntransfer/dataset/img_dataset_loader.py
new file mode 100644
index 0000000..cbec3e1
--- /dev/null
+++ b/nntransfer/dataset/img_dataset_loader.py
@@ -0,0 +1,295 @@
+import os
+import numpy as np
+import torch
+import torchvision
+import torchvision.transforms as transforms
+from torch.utils.data.dataset import ConcatDataset, Subset
+from torch.utils.data.sampler import SubsetRandomSampler
+from torchvision import datasets
+from nntransfer.configs.dataset.image import ImageDatasetConfig
+from nntransfer.dataset.dataset_classes.npy_dataset import NpyDataset
+from nntransfer.dataset.utils import get_dataset
+
+DATASET_URLS = {
+    "CIFAR10-C": "https://zenodo.org/record/2535967/files/CIFAR-10-C.tar",
+    "CIFAR100-C": "https://zenodo.org/record/3555552/files/CIFAR-100-C.tar",
+    "TinyImageNet-C": "https://zenodo.org/record/2536630/files/Tiny-ImageNet-C.tar",
+    "TinyImageNet-ST": "https://informatikunihamburgde-my.sharepoint.com/:u:/g/personal/shahd_safarani_informatik_uni-hamburg_de/EZhUKKVXTvRHlqi2HXHaIjEBLmAv4tQP8olvdGNRoWrPqA?e=8kSrHI&download=1",
+    "ImageNet-C": {
+        "blur": "https://zenodo.org/record/2235448/files/blur.tar",
+        "digital": "https://zenodo.org/record/2235448/files/digital.tar",
+        "extra": "https://zenodo.org/record/2235448/files/extra.tar",
+        "noise": "https://zenodo.org/record/2235448/files/noise.tar",
+        "weather": "https://zenodo.org/record/2235448/files/weather.tar",
+    },
+}
+
+
+class ImageDatasetLoader:
+    def __call__(self, seed, **config):
+        """
+        Utility function for loading and returning train and valid
+        multi-process iterators over the configured image dataset
+        (CIFAR, TinyImageNet or ImageNet variants).
+        If using CUDA, num_workers should be set to 1 and pin_memory to True.
+        Params
+        ------
+        - data_dir: path directory to the dataset.
+        - batch_size: how many samples per batch to load.
+        - augment: whether to apply the data augmentation scheme
+          mentioned in the paper. Only applied on the train split.
+        - seed: fix seed for reproducibility.
+        - valid_size: percentage split of the training set used for
+          the validation set. Should be a float in the range [0, 1].
+        - shuffle: whether to shuffle the train/validation indices.
+        - num_workers: number of subprocesses to use when loading the dataset.
+        - pin_memory: whether to copy tensors into CUDA pinned memory. Set it to
+          True if using GPU.
+        Returns
+        -------
+        - data_loaders: nested dict of train/validation/test iterators
+          (plus optional corrupted and stylized test iterators).
+        """
+        config = ImageDatasetConfig.from_dict(config)
+        print("Loading dataset: {}".format(config.dataset_cls))
+        torch.manual_seed(seed)
+        np.random.seed(seed)
+
+        transform_test, transform_train, transform_val = self.get_transforms(config)
+
+        error_msg = "[!] valid_size should be in the range [0, 1]."
+        assert (config.valid_size >= 0) and (config.valid_size <= 1), error_msg
+
+        (
+            train_dataset,
+            valid_dataset,
+            test_dataset,
+            c_test_datasets,
+            st_test_dataset,
+        ) = self.get_datasets(config, transform_test, transform_train, transform_val)
+
+        filters = [globals().get(f)(config, train_dataset) for f in config.filters]
+        datasets_ = [train_dataset, valid_dataset, test_dataset]
+        if config.add_corrupted_test:
+            for c_ds in c_test_datasets.values():
+                datasets_ += list(c_ds.values())
+        for ds in datasets_:
+            for filt in filters:
+                filt.apply(ds)
+
+        data_loaders = self.get_data_loaders(
+            st_test_dataset,
+            c_test_datasets,
+            config,
+            seed,
+            test_dataset,
+            train_dataset,
+            valid_dataset,
+        )
+
+        return data_loaders
+
+    def get_transforms(self, config):
+        """
+
+        Args:
+            config:
+
+        Returns:
+            transform_test,
+            transform_train,
+            transform_val
+        """
+        raise NotImplementedError()
+
+    def get_datasets(self, config, transform_test, transform_train, transform_val):
+        """
+
+        Args:
+            config:
+            transform_test:
+            transform_train:
+            transform_val:
+
+        Returns:
+            train_dataset,
+            valid_dataset,
+            test_dataset,
+            c_test_datasets,
+            st_test_dataset,
+        """
+        raise NotImplementedError()
+
+    def add_corrupted_test(self, config, transform_test):
+        c_test_datasets = None
+        if config.add_corrupted_test:
+            urls = DATASET_URLS[config.dataset_cls + "-C"]
+            if not isinstance(urls, dict):
+                urls = {"default": urls}
+            for key, url in urls.items():
+                dataset_dir = get_dataset(
+                    url,
+                    config.data_dir,
+                    dataset_cls=config.dataset_cls + "-C",
+                )
+
+                c_test_datasets = {}
+                for c_category in os.listdir(dataset_dir):
+                    if config.dataset_cls in ("CIFAR10", "CIFAR100"):
+                        if c_category == "labels.npy" or not c_category.endswith(
+                            ".npy"
+                        ):
+                            continue
+                        c_test_datasets[c_category[:-4]] = {}
+                        for c_level in range(1, 6):
+                            start = (c_level - 1) * 10000
+                            end = c_level * 10000
+                            c_test_datasets[c_category[:-4]][c_level] = NpyDataset(
+                                samples=c_category,
+                                targets="labels.npy",
+                                root=dataset_dir,
+                                start=start,
+                                end=end,
+                                transform=transform_test,
+                            )
+                    else:
+                        if not os.path.isdir(os.path.join(dataset_dir, c_category)):
+                            continue
+                        c_test_datasets[c_category] = {}
+                        for c_level in os.listdir(
+                            os.path.join(dataset_dir, c_category)
+                        ):
+                            c_test_datasets[c_category][
+                                int(c_level)
+                            ] = datasets.ImageFolder(
+                                os.path.join(dataset_dir, c_category, c_level),
+                                transform=transform_test,
+                            )
+        return c_test_datasets
+
+    def add_stylized_test(self, config, transform_test):
+        st_test_dataset = None
+        if config.add_stylized_test:
+            st_dataset_dir = get_dataset(
+                DATASET_URLS[config.dataset_cls + "-ST"],
+                config.data_dir,
+                dataset_cls=config.dataset_cls + "-ST",
+            )
+            st_test_dataset = datasets.ImageFolder(
+                st_dataset_dir, transform=transform_test
+            )
+        return st_test_dataset
+
+    def get_data_loaders(
+        self,
+        st_test_dataset,
+        c_test_datasets,
+        config,
+        seed,
+        test_dataset,
+        train_dataset,
+        valid_dataset,
+    ):
+        num_train = len(train_dataset)
+        indices = list(range(num_train))
+        if config.use_c_test_as_val:  # Use valid_size of the c_test set for validation
+            train_sampler = SubsetRandomSampler(indices)
+            datasets = []
+            val_indices = []
+            start_idx = 0
+            for c_category in c_test_datasets.keys():
+                if c_category not in (
+                    "speckle_noise",
+                    "gaussian_blur",
+                    "spatter",
+                    "saturate",
+                ):
+                    continue
+                for dataset in c_test_datasets[c_category].values():
+                    num_val = len(dataset)
+                    indices = list(range(start_idx, start_idx + num_val))
+                    split = int(np.floor(config.valid_size * num_val))
+                    if config.shuffle:
+                        np.random.shuffle(indices)
+                    val_indices += indices[:split]
+                    datasets.append(dataset)
+                    start_idx += num_val
+            valid_dataset = ConcatDataset(datasets)
+            valid_sampler = SubsetRandomSampler(val_indices)
+        else:  # Use valid_size of the train set for validation
+            split = int(np.floor(config.valid_size * num_train))
+            if config.shuffle:
+                np.random.seed(seed)
+                np.random.shuffle(indices)
+            train_idx, valid_idx = indices[split:], indices[:split]
+            if config.train_subset:
+                subset_split = int(np.floor(config.train_subset * len(train_idx)))
+                train_idx = train_idx[:subset_split]
+            if config.shuffle:
+                train_sampler = SubsetRandomSampler(train_idx)
+                valid_sampler = SubsetRandomSampler(valid_idx)
+            else:
+                valid_dataset = Subset(train_dataset, valid_idx)
+                train_dataset = Subset(train_dataset, train_idx)
+                train_sampler = None
+                valid_sampler = None
+        train_loader = torch.utils.data.DataLoader(
+            train_dataset,
+            batch_size=config.batch_size,
+            sampler=train_sampler,
+            num_workers=config.num_workers,
+            pin_memory=config.pin_memory,
+            shuffle=False,
+        )
+        valid_loader = torch.utils.data.DataLoader(
+            valid_dataset,
+            batch_size=config.batch_size,
+            sampler=valid_sampler,
+            num_workers=config.num_workers,
+            pin_memory=config.pin_memory,
+            shuffle=False,
+        )
+        test_loader = torch.utils.data.DataLoader(
+            test_dataset,
+            batch_size=config.batch_size,
+            num_workers=config.num_workers,
+            pin_memory=config.pin_memory,
+            shuffle=True,
+        )
+        task_key = (
+            "regression"
+            if config.bias is not None and "regression" in config.bias
+            else "img_classification"
+        )
+        data_loaders = {
+            "train": {task_key: train_loader},
+            "validation": {task_key: valid_loader},
+            "test": {task_key: test_loader},
+        }
+
+        if config.add_stylized_test:
+            st_test_loader = torch.utils.data.DataLoader(
+                st_test_dataset,
+                batch_size=config.batch_size,
+                num_workers=config.num_workers,
+                pin_memory=config.pin_memory,
+                shuffle=False,
+            )
+            data_loaders["st_test"] = st_test_loader
+
+        if config.add_corrupted_test:
+            c_test_loaders = {}
+            for c_category in c_test_datasets.keys():
+                c_test_loaders[c_category] = {}
+                for c_level, dataset in c_test_datasets[c_category].items():
+                    c_test_loaders[c_category][c_level] = torch.utils.data.DataLoader(
+                        dataset,
+                        batch_size=config.batch_size,
+                        num_workers=config.num_workers,
+                        pin_memory=config.pin_memory,
+                        shuffle=True,
+                    )
+            data_loaders["c_test"] = {"img_classification": c_test_loaders}
+        return data_loaders
+
+
diff --git a/bias_transfer/dataset/scripts/download_imagenet.sh b/nntransfer/dataset/scripts/download_imagenet.sh
similarity index 100%
rename from bias_transfer/dataset/scripts/download_imagenet.sh
rename to nntransfer/dataset/scripts/download_imagenet.sh
diff --git a/bias_transfer/dataset/utils.py b/nntransfer/dataset/utils.py
similarity index 100%
rename from bias_transfer/dataset/utils.py
rename to nntransfer/dataset/utils.py
diff --git a/nntransfer/models/__init__.py b/nntransfer/models/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/nntransfer/models/__init__.py
@@ -0,0 +1 @@
+
diff --git a/bias_transfer/models/lenet.py b/nntransfer/models/lenet.py
similarity index 100%
rename from bias_transfer/models/lenet.py
rename to nntransfer/models/lenet.py
diff --git a/bias_transfer/models/mlp.py b/nntransfer/models/mlp.py
similarity index 100%
rename from bias_transfer/models/mlp.py
rename to nntransfer/models/mlp.py
diff --git a/bias_transfer/models/resnet.py b/nntransfer/models/resnet.py
similarity index 100%
rename from bias_transfer/models/resnet.py
rename to nntransfer/models/resnet.py
diff --git a/bias_transfer/models/utils.py b/nntransfer/models/utils.py
similarity index 98%
rename from bias_transfer/models/utils.py
rename to nntransfer/models/utils.py
index de3010d..47a82b5 100644
--- a/bias_transfer/models/utils.py
+++ b/nntransfer/models/utils.py
@@ -3,7 +3,7 @@ import torch
 from torch import nn
 from torchvision.models.resnet import Bottleneck, BasicBlock
 
-from bias_transfer.models.resnet import ResNet
+from nntransfer.models.resnet import ResNet
 
 
 def reset_params(model, reset=None):
diff --git a/bias_transfer/models/vgg.py b/nntransfer/models/vgg.py
similarity index 100%
rename from bias_transfer/models/vgg.py
rename to nntransfer/models/vgg.py
diff --git a/bias_transfer/models/wrappers/__init__.py b/nntransfer/models/wrappers/__init__.py
similarity index 61%
rename from bias_transfer/models/wrappers/__init__.py
rename to nntransfer/models/wrappers/__init__.py
index e91f18e..d110c22 100644
--- a/bias_transfer/models/wrappers/__init__.py
+++ b/nntransfer/models/wrappers/__init__.py
@@ -1,2 +1 @@
-from .noise_adv import NoiseAdvWrapper
 from .intermediate_layer_getter import IntermediateLayerGetter
\ No newline at end of file
diff --git a/bias_transfer/models/wrappers/intermediate_layer_getter.py b/nntransfer/models/wrappers/intermediate_layer_getter.py
similarity index 100%
rename from bias_transfer/models/wrappers/intermediate_layer_getter.py
rename to nntransfer/models/wrappers/intermediate_layer_getter.py
diff --git a/bias_transfer/tables/__init__.py b/nntransfer/tables/__init__.py
similarity index 100%
rename from bias_transfer/tables/__init__.py
rename to nntransfer/tables/__init__.py
diff --git a/bias_transfer/tables/nnfabrik.py b/nntransfer/tables/nnfabrik.py
similarity index 100%
rename from bias_transfer/tables/nnfabrik.py
rename to nntransfer/tables/nnfabrik.py
diff --git a/bias_transfer/tables/trained_model.py b/nntransfer/tables/trained_model.py
similarity index 100%
rename from bias_transfer/tables/trained_model.py
rename to nntransfer/tables/trained_model.py
diff --git a/bias_transfer/tables/transfer.py b/nntransfer/tables/transfer.py
similarity index 100%
rename from bias_transfer/tables/transfer.py
rename to nntransfer/tables/transfer.py
diff --git a/nntransfer/trainer/__init__.py b/nntransfer/trainer/__init__.py
new file mode 100644
index 0000000..0378809
--- /dev/null
+++ b/nntransfer/trainer/__init__.py
@@ -0,0 +1,2 @@
+from bias_transfer.trainer.transfer import trainer as transfer
+from bias_transfer.trainer.transfer import regression_trainer as regression_transfer
diff --git a/nntransfer/trainer/main_loop_modules/__init__.py b/nntransfer/trainer/main_loop_modules/__init__.py
new file mode 100644
index 0000000..b3c0b60
--- /dev/null
+++ b/nntransfer/trainer/main_loop_modules/__init__.py
@@ -0,0 +1,4 @@
+from .noise_augmentation import NoiseAugmentation
+from .representation_matching import RepresentationMatching
+from .representation_monitor import RepresentationMonitor
+from .model_wrapper import ModelWrapper
diff --git a/bias_transfer/trainer/main_loop_modules/main_loop_module.py b/nntransfer/trainer/main_loop_modules/main_loop_module.py
similarity index 100%
rename from bias_transfer/trainer/main_loop_modules/main_loop_module.py
rename to nntransfer/trainer/main_loop_modules/main_loop_module.py
diff --git a/bias_transfer/trainer/main_loop_modules/model_wrapper.py b/nntransfer/trainer/main_loop_modules/model_wrapper.py
similarity index 100%
rename from bias_transfer/trainer/main_loop_modules/model_wrapper.py
rename to nntransfer/trainer/main_loop_modules/model_wrapper.py
diff --git a/bias_transfer/trainer/main_loop_modules/noise_augmentation.py b/nntransfer/trainer/main_loop_modules/noise_augmentation.py
similarity index 100%
rename from bias_transfer/trainer/main_loop_modules/noise_augmentation.py
rename to nntransfer/trainer/main_loop_modules/noise_augmentation.py
diff --git a/bias_transfer/trainer/main_loop_modules/representation_matching.py b/nntransfer/trainer/main_loop_modules/representation_matching.py
similarity index 100%
rename from bias_transfer/trainer/main_loop_modules/representation_matching.py
rename to nntransfer/trainer/main_loop_modules/representation_matching.py
diff --git a/bias_transfer/trainer/main_loop_modules/representation_monitor.py b/nntransfer/trainer/main_loop_modules/representation_monitor.py
similarity index 100%
rename from bias_transfer/trainer/main_loop_modules/representation_monitor.py
rename to nntransfer/trainer/main_loop_modules/representation_monitor.py
diff --git a/bias_transfer/trainer/trainer.py b/nntransfer/trainer/trainer.py
similarity index 95%
rename from bias_transfer/trainer/trainer.py
rename to nntransfer/trainer/trainer.py
index ba1fd0f..e422097 100644
--- a/bias_transfer/trainer/trainer.py
+++ b/nntransfer/trainer/trainer.py
@@ -4,22 +4,24 @@ from tqdm import tqdm
 
 import torch
 from torch import optim, nn
+
 import nnfabrik as nnf
 from neuralpredictors.training import copy_state
+from nnfabrik.utility.nn_helpers import load_state_dict
 
-from bias_transfer.models.utils import (
+from nntransfer.models.utils import (
     freeze_params,
     set_bn_to_eval,
     weight_reset,
     reset_params,
 )
-from bias_transfer.trainer.utils import SchedulerWrapper
-from bias_transfer.configs.trainer import TrainerConfig
-from nnfabrik.utility.nn_helpers import load_state_dict
-from bias_transfer.trainer.utils.checkpointing import LocalCheckpointing, RemoteCheckpointing
-from bias_transfer.trainer.main_loop_modules import *
-from bias_transfer.trainer.utils import LongCycler, MTL_Cycler
-from bias_transfer.trainer.utils.early_stopping import early_stopping
+from nntransfer.configs.trainer import TrainerConfig
+
+from .main_loop_modules import *
+from .utils import SchedulerWrapper
+from .utils.checkpointing import LocalCheckpointing, RemoteCheckpointing
+from .utils import LongCycler, MTL_Cycler
+from .utils.early_stopping import early_stopping
 
 
 class Trainer:
diff --git a/bias_transfer/trainer/utils/__init__.py b/nntransfer/trainer/utils/__init__.py
similarity index 100%
rename from bias_transfer/trainer/utils/__init__.py
rename to nntransfer/trainer/utils/__init__.py
diff --git a/bias_transfer/trainer/utils/checkpointing.py b/nntransfer/trainer/utils/checkpointing.py
similarity index 100%
rename from bias_transfer/trainer/utils/checkpointing.py
rename to nntransfer/trainer/utils/checkpointing.py
diff --git a/bias_transfer/trainer/utils/early_stopping.py b/nntransfer/trainer/utils/early_stopping.py
similarity index 98%
rename from bias_transfer/trainer/utils/early_stopping.py
rename to nntransfer/trainer/utils/early_stopping.py
index d57dd53..cef66ee 100644
--- a/bias_transfer/trainer/utils/early_stopping.py
+++ b/nntransfer/trainer/utils/early_stopping.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-from bias_transfer.trainer.utils import StopClosureWrapper
+from nntransfer.trainer.utils import StopClosureWrapper
 
 
 def early_stopping(
diff --git a/bias_transfer/trainer/utils/loss.py b/nntransfer/trainer/utils/loss.py
similarity index 100%
rename from bias_transfer/trainer/utils/loss.py
rename to nntransfer/trainer/utils/loss.py
diff --git a/bias_transfer/trainer/utils/warmup.py b/nntransfer/trainer/utils/warmup.py
similarity index 100%
rename from bias_transfer/trainer/utils/warmup.py
rename to nntransfer/trainer/utils/warmup.py
diff --git a/setup.py b/setup.py
index 679e306..d39eb01 100644
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,11 @@
 from setuptools import setup
 
 setup(
-    name='bias_transfer',
-    version='0.1dev',
-    description='Experiments about inductive bias transfer',
-    author='Arne Nix',
-    author_email='arnenix@googlemail.com',
-    packages=['bias_transfer'], #same as name
-    install_requires=[], #external packages as dependencies
-)
\ No newline at end of file
+    name="nntransfer",
+    version="0.1dev",
+    description="Framework for transfer experiments",
+    author="Arne Nix",
+    author_email="arnenix@googlemail.com",
+    packages=["nntransfer"],  # same as name
+    install_requires=[],  # external packages as dependencies
+)
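
The trainer-config mixins kept by this change (NoiseAugmentationMixin, RepresentationMatchingMixin, RepresentationMonitorMixin) all subclass BaseConfig and are meant to be stacked on top of TrainerConfig; the removed NoiseAdversarialMixin followed the same pattern and registered its main-loop module from conditional_assignment(). The sketch below shows one plausible composition under that assumption; the composed class name and the choice of exactly these two mixins are illustrative and not part of the diff.

from nntransfer.configs.trainer import TrainerConfig
from nntransfer.configs.trainer.mixins import (
    NoiseAugmentationMixin,
    RepresentationMatchingMixin,
)


# Hypothetical composed config: each mixin's conditional_assignment() can append
# the main-loop module it needs (the removed NoiseAdversarialMixin, for example,
# appended "NoiseAdvTraining" to main_loop_modules).
class NoisyRepresentationMatchingTrainerConfig(
    NoiseAugmentationMixin, RepresentationMatchingMixin, TrainerConfig
):
    pass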
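
ImageDatasetLoader in the new nntransfer/dataset/img_dataset_loader.py is abstract: get_transforms() and get_datasets() raise NotImplementedError and are meant to be filled in per dataset, while add_corrupted_test()/add_stylized_test() and get_data_loaders() are inherited. A minimal sketch of a concrete CIFAR-10 subclass follows; the class name, the transform choices, and the download=True flags are assumptions for illustration, and only the hook signatures and the five-tuple return value come from the diff.

import torchvision
import torchvision.transforms as transforms

from nntransfer.dataset.img_dataset_loader import ImageDatasetLoader


class CIFAR10DatasetLoader(ImageDatasetLoader):  # hypothetical subclass
    def get_transforms(self, config):
        # Shared normalization; train-time augmentation kept deliberately small.
        normalize = transforms.Normalize(
            mean=(0.4914, 0.4822, 0.4465), std=(0.2470, 0.2435, 0.2616)
        )
        transform_train = transforms.Compose(
            [
                transforms.RandomCrop(32, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ]
        )
        transform_val = transforms.Compose([transforms.ToTensor(), normalize])
        transform_test = transform_val
        return transform_test, transform_train, transform_val

    def get_datasets(self, config, transform_test, transform_train, transform_val):
        train_dataset = torchvision.datasets.CIFAR10(
            root=config.data_dir, train=True, download=True, transform=transform_train
        )
        # The base class splits train/validation itself, so the validation set is
        # the same underlying data wrapped with validation-time transforms.
        valid_dataset = torchvision.datasets.CIFAR10(
            root=config.data_dir, train=True, download=True, transform=transform_val
        )
        test_dataset = torchvision.datasets.CIFAR10(
            root=config.data_dir, train=False, download=True, transform=transform_test
        )
        # Corrupted/stylized test sets are fetched by the inherited helpers, which
        # check config.add_corrupted_test / config.add_stylized_test internally.
        c_test_datasets = self.add_corrupted_test(config, transform_test)
        st_test_dataset = self.add_stylized_test(config, transform_test)
        return (
            train_dataset,
            valid_dataset,
            test_dataset,
            c_test_datasets,
            st_test_dataset,
        )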
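
The dictionary returned by get_data_loaders() is nested: "train", "validation" and "test" map a task key ("img_classification" or "regression", depending on config.bias) to a DataLoader, an optional "st_test" entry holds the stylized test loader, and "c_test" maps corruption category and severity level to loaders. The helper below is an illustrative consumer of that structure (the evaluate_fn callback is an assumption, not part of the codebase).

def evaluate_all_splits(data_loaders, evaluate_fn):
    """Run `evaluate_fn(loader)` on every split contained in the nested dict."""
    results = {}
    for split in ("train", "validation", "test"):
        for task_key, loader in data_loaders[split].items():
            results[(split, task_key)] = evaluate_fn(loader)
    # Corrupted test sets are nested one level deeper: category -> severity level.
    c_test = data_loaders.get("c_test", {}).get("img_classification", {})
    for category, levels in c_test.items():
        for level, loader in levels.items():
            results[("c_test", category, level)] = evaluate_fn(loader)
    if "st_test" in data_loaders:
        results[("st_test",)] = evaluate_fn(data_loaders["st_test"])
    return results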
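
Since setup.py now installs a package named nntransfer instead of bias_transfer, downstream code has to switch its imports to the new module paths. The lines below mirror import statements that appear verbatim in the moved modules of this diff.

# old (pre-rename)
# from bias_transfer.configs.base import BaseConfig
# from bias_transfer.configs.trainer import TrainerConfig
# from bias_transfer.models.resnet import ResNet

# new (post-rename)
from nntransfer.configs.base import BaseConfig
from nntransfer.configs.trainer import TrainerConfig
from nntransfer.models.resnet import ResNet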