From 611d974591b32f03a7dd1ef4c3c14355739d9aca Mon Sep 17 00:00:00 2001 From: Andrey Guzhov Date: Mon, 28 Jun 2021 10:30:11 +0200 Subject: [PATCH] create repo --- .gitattributes | 2 + README.md | 47 ++ assets/AudioCLIP-Full-Training.pt | 3 + assets/AudioCLIP-Partial-Training.pt | 3 + assets/CLIP.pt | 3 + assets/ESRNXFBSP.pt | 3 + assets/README.md | 1 + assets/bpe_simple_vocab_16e6.txt.gz | 3 + ignite_trainer/README.md | 3 + ignite_trainer/__init__.py | 16 + ignite_trainer/_interfaces.py | 37 ++ ignite_trainer/_trainer.py | 763 +++++++++++++++++++++++++++ ignite_trainer/_utils.py | 221 ++++++++ ignite_trainer/_visdom.py | 191 +++++++ ignite_trainer/version.py | 1 + main.py | 11 + model/__init__.py | 3 + model/audioclip.py | 255 +++++++++ model/clip/__init__.py | 5 + model/clip/clip.py | 193 +++++++ model/clip/model.py | 433 +++++++++++++++ model/esresnet/__init__.py | 8 + model/esresnet/attention.py | 40 ++ model/esresnet/base.py | 708 +++++++++++++++++++++++++ model/esresnet/fbsp.py | 247 +++++++++ protocols/README.md | 3 + protocols/audioclip-esc50.json | 108 ++++ protocols/audioclip-us8k.json | 108 ++++ requirements.txt | 11 + utils/__init__.py | 7 + utils/datasets/__init__.py | 4 + utils/datasets/esc50.py | 128 +++++ utils/datasets/us8k.py | 167 ++++++ utils/simple_tokenizer.py | 134 +++++ utils/transforms.py | 199 +++++++ 35 files changed, 4069 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 assets/AudioCLIP-Full-Training.pt create mode 100644 assets/AudioCLIP-Partial-Training.pt create mode 100644 assets/CLIP.pt create mode 100644 assets/ESRNXFBSP.pt create mode 100644 assets/README.md create mode 100644 assets/bpe_simple_vocab_16e6.txt.gz create mode 100644 ignite_trainer/README.md create mode 100644 ignite_trainer/__init__.py create mode 100644 ignite_trainer/_interfaces.py create mode 100644 ignite_trainer/_trainer.py create mode 100644 ignite_trainer/_utils.py create mode 100644 ignite_trainer/_visdom.py create mode 100644 ignite_trainer/version.py create mode 100644 main.py create mode 100644 model/__init__.py create mode 100644 model/audioclip.py create mode 100644 model/clip/__init__.py create mode 100644 model/clip/clip.py create mode 100644 model/clip/model.py create mode 100644 model/esresnet/__init__.py create mode 100644 model/esresnet/attention.py create mode 100644 model/esresnet/base.py create mode 100644 model/esresnet/fbsp.py create mode 100644 protocols/README.md create mode 100644 protocols/audioclip-esc50.json create mode 100644 protocols/audioclip-us8k.json create mode 100644 requirements.txt create mode 100644 utils/__init__.py create mode 100644 utils/datasets/__init__.py create mode 100644 utils/datasets/esc50.py create mode 100644 utils/datasets/us8k.py create mode 100644 utils/simple_tokenizer.py create mode 100644 utils/transforms.py diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..bb29d8c --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +*.pt filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..32f7bbd --- /dev/null +++ b/README.md @@ -0,0 +1,47 @@ +# AudioCLIP +## Extending [CLIP](https://github.com/openai/CLIP) to Image, Text and Audio + +This repository contains implementation of the models described in the paper [arXiv:2106.13043](https://arxiv.org/abs/2106.13043). 
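+
+Beyond the training protocols described below, the pretrained model can be used directly for zero-shot inference. The snippet below is a minimal sketch, assuming the bundled checkpoint `assets/AudioCLIP-Full-Training.pt`, the repository root as the working directory, and an arbitrary label set; the random tensor stands in for a batch of 44.1 kHz mono waveforms.
+
+```python
+import torch
+
+from model import AudioCLIP
+
+# Load the full-training checkpoint shipped in assets/ (assumed path).
+model = AudioCLIP(pretrained='assets/AudioCLIP-Full-Training.pt').eval()
+
+labels = [['dog barking'], ['rain'], ['siren']]
+audio = torch.randn(2, 1, 220500)  # placeholder: two mono ~5 s clips at 44.1 kHz
+
+with torch.no_grad():
+    # Audio and text are encoded in separate forward passes, mirroring the evaluation loop.
+    ((audio_features, _, _), _), _ = model(audio=audio)
+    ((_, _, text_features), _), _ = model(
+        text=labels,
+        batch_indices=torch.arange(len(labels), dtype=torch.int64)
+    )
+
+    # Features come back L2-normalized; scaled dot products give audio-vs-text logits.
+    logit_scale_at = torch.clamp(model.logit_scale_at.exp(), min=1.0, max=100.0)
+    print((logit_scale_at * audio_features @ text_features.T).softmax(dim=-1))
+```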
+This work based on our previous works: +* [ESResNe(X)t-fbsp: Learning Robust Time-Frequency Transformation of Audio (2021)](https://github.com/AndreyGuzhov/ESResNeXt-fbsp). +* [ESResNet: Environmental Sound Classification Based on Visual Domain Models (2020)](https://github.com/AndreyGuzhov/ESResNet). + +### Abstract + +In the past, the rapidly evolving field of sound classification greatly benefited from the application of methods from other domains. +Today, we observe the trend to fuse domain-specific tasks and approaches together, which provides the community with new outstanding models. + +In this work, we present an extension of the CLIP model that handles audio in addition to text and images. +Our proposed model incorporates the ESResNeXt audio-model into the CLIP framework using the AudioSet dataset. +Such a combination enables the proposed model to perform bimodal and unimodal classification and querying, while keeping CLIP's ability to generalize to unseen datasets in a zero-shot inference fashion. + +AudioCLIP achieves new state-of-the-art results in the Environmental Sound Classification (ESC) task, out-performing other approaches by reaching accuracies of 90.07% on the UrbanSound8K and 97.15% on the ESC-50 datasets. +Further it sets new baselines in the zero-shot ESC-task on the same datasets (68.78% and 69.40%, respectively). + +Finally, we also assess the cross-modal querying performance of the proposed model as well as the influence of full and partial training on the results. +For the sake of reproducibility, our code is published. + +### How to Run the Model + +The required Python version is >= 3.7. + +#### AudioCLIP + +##### On the [ESC-50](https://github.com/karolpiczak/ESC-50) dataset + python main.py --config protocols/audioclip-esc50.json --Dataset.args.root /path/to/ESC50 + +##### On the [UrbanSound8K](https://urbansounddataset.weebly.com/) dataset + python main.py --config protocols/audioclip-us8k.json --Dataset.args.root /path/to/UrbanSound8K + +### Cite Us + +``` +@misc{guzhov2021audioclip, + title={AudioCLIP: Extending CLIP to Image, Text and Audio}, + author={Andrey Guzhov and Federico Raue and Jörn Hees and Andreas Dengel}, + year={2021}, + eprint={2106.13043}, + archivePrefix={arXiv}, + primaryClass={cs.SD} +} +``` diff --git a/assets/AudioCLIP-Full-Training.pt b/assets/AudioCLIP-Full-Training.pt new file mode 100644 index 0000000..3d85a17 --- /dev/null +++ b/assets/AudioCLIP-Full-Training.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2441d35b353352c8b1bbfb8f7c687f46314c3d2909e940eaf763b8c17f632c44 +size 537302068 diff --git a/assets/AudioCLIP-Partial-Training.pt b/assets/AudioCLIP-Partial-Training.pt new file mode 100644 index 0000000..fc67480 --- /dev/null +++ b/assets/AudioCLIP-Partial-Training.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1921885b6c3b5de0619f2ef4a9700ecbade758a398b34eb064230c30baef0c75 +size 537302068 diff --git a/assets/CLIP.pt b/assets/CLIP.pt new file mode 100644 index 0000000..f5823d4 --- /dev/null +++ b/assets/CLIP.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:124e6d59d54c0837c456c953b3147adf438a21570d5c5b01ad83f6ad3e78d15e +size 408415159 diff --git a/assets/ESRNXFBSP.pt b/assets/ESRNXFBSP.pt new file mode 100644 index 0000000..ec3bb0a --- /dev/null +++ b/assets/ESRNXFBSP.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acfcc6b0a0f07025cc660ed6dcf9b5a05fbd7c8ebf65edaa0a81d88a92ff6834 +size 124795647 diff --git 
a/assets/README.md b/assets/README.md new file mode 100644 index 0000000..a84a522 --- /dev/null +++ b/assets/README.md @@ -0,0 +1 @@ +This folder contains snapshots of the pre-trained models. diff --git a/assets/bpe_simple_vocab_16e6.txt.gz b/assets/bpe_simple_vocab_16e6.txt.gz new file mode 100644 index 0000000..36a1585 --- /dev/null +++ b/assets/bpe_simple_vocab_16e6.txt.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a +size 1356917 diff --git a/ignite_trainer/README.md b/ignite_trainer/README.md new file mode 100644 index 0000000..c58920d --- /dev/null +++ b/ignite_trainer/README.md @@ -0,0 +1,3 @@ +# Training Wrapper + +Utility code to run training and evaluation of the model. diff --git a/ignite_trainer/__init__.py b/ignite_trainer/__init__.py new file mode 100644 index 0000000..f57a10e --- /dev/null +++ b/ignite_trainer/__init__.py @@ -0,0 +1,16 @@ +import os as _os +import sys as _sys + +from ignite_trainer.version import __version__ +from ._trainer import main, run +from ._utils import load_class +from ._interfaces import AbstractNet, AbstractTransform + +__all__ = [ + '__version__', + 'main', 'run', + 'load_class', + 'AbstractNet', 'AbstractTransform' +] + +_sys.path.extend([_os.getcwd()]) diff --git a/ignite_trainer/_interfaces.py b/ignite_trainer/_interfaces.py new file mode 100644 index 0000000..882d425 --- /dev/null +++ b/ignite_trainer/_interfaces.py @@ -0,0 +1,37 @@ +import abc +import torch + +from typing import Tuple +from typing import Union +from typing import Callable +from typing import Optional + + +TensorPair = Tuple[torch.Tensor, torch.Tensor] +TensorOrTwo = Union[torch.Tensor, TensorPair] + + +class AbstractNet(abc.ABC, torch.nn.Module): + + @abc.abstractmethod + def forward(self, x: torch.Tensor, y: Optional[torch.Tensor] = None) -> TensorOrTwo: + pass + + @abc.abstractmethod + def loss_fn(self, y_pred: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + pass + + @property + @abc.abstractmethod + def loss_fn_name(self) -> str: + pass + + +class AbstractTransform(abc.ABC, Callable[[torch.Tensor], torch.Tensor]): + + @abc.abstractmethod + def __call__(self, x: torch.Tensor) -> torch.Tensor: + pass + + def __repr__(self): + return self.__class__.__name__ + '()' diff --git a/ignite_trainer/_trainer.py b/ignite_trainer/_trainer.py new file mode 100644 index 0000000..b78cac0 --- /dev/null +++ b/ignite_trainer/_trainer.py @@ -0,0 +1,763 @@ +import io +import os +import glob +import json +import time +import tqdm +import signal +import argparse +import numpy as np + +import torch +import torch.utils.data + +import torchvision as tv + +import ignite.engine as ieng +import ignite.metrics as imet +import ignite.handlers as ihan + +from typing import Any +from typing import Dict +from typing import List +from typing import Type +from typing import Union +from typing import Optional + +from termcolor import colored + +from collections import defaultdict +from collections.abc import Iterable + +from ignite_trainer import _utils +from ignite_trainer import _visdom +from ignite_trainer import _interfaces + +VISDOM_HOST = 'localhost' +VISDOM_PORT = 8097 +VISDOM_ENV_PATH = os.path.join(os.path.expanduser('~'), 'logs') +BATCH_TRAIN = 128 +BATCH_TEST = 1024 +WORKERS_TRAIN = 0 +WORKERS_TEST = 0 +EPOCHS = 100 +LOG_INTERVAL = 50 +SAVED_MODELS_PATH = os.path.join(os.path.expanduser('~'), 'saved_models') + + +def run(experiment_name: str, + visdom_host: str, + visdom_port: int, + visdom_env_path: str, 
+ model_class: str, + model_args: Dict[str, Any], + optimizer_class: str, + optimizer_args: Dict[str, Any], + dataset_class: str, + dataset_args: Dict[str, Any], + batch_train: int, + batch_test: int, + workers_train: int, + workers_test: int, + transforms: List[Dict[str, Union[str, Dict[str, Any]]]], + epochs: int, + log_interval: int, + saved_models_path: str, + performance_metrics: Optional = None, + scheduler_class: Optional[str] = None, + scheduler_args: Optional[Dict[str, Any]] = None, + model_suffix: Optional[str] = None, + setup_suffix: Optional[str] = None, + orig_stdout: Optional[io.TextIOBase] = None, + skip_train_val: bool = False): + + with _utils.tqdm_stdout(orig_stdout) as orig_stdout: + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + num_gpus = torch.cuda.device_count() + + if num_gpus > 1: + experiment_name = f'{experiment_name}-x{num_gpus}' + + transforms_train = list() + transforms_test = list() + + for idx, transform in enumerate(transforms): + use_train = transform.get('train', True) + use_test = transform.get('test', True) + + transform = _utils.load_class(transform['class'])(**transform['args']) + + if use_train: + transforms_train.append(transform) + if use_test: + transforms_test.append(transform) + + transforms[idx]['train'] = use_train + transforms[idx]['test'] = use_test + + transforms_train = tv.transforms.Compose(transforms_train) + transforms_test = tv.transforms.Compose(transforms_test) + + Dataset: Type = _utils.load_class(dataset_class) + + train_loader, eval_loader = _utils.get_data_loaders( + Dataset, + dataset_args, + batch_train, + batch_test, + workers_train, + workers_test, + transforms_train, + transforms_test + ) + + Network: Type = _utils.load_class(model_class) + model: _interfaces.AbstractNet = Network(**model_args) + + if hasattr(train_loader.dataset, 'class_weights'): + model.register_buffer('class_weights', train_loader.dataset.class_weights.clone().exp(), persistent=False) + if hasattr(train_loader.dataset, 'label_to_class_idx'): + model.label_to_class_idx = {idx: lb for idx, lb in train_loader.dataset.label_to_class_idx.items()} + + model = torch.nn.DataParallel(model, device_ids=range(num_gpus)) + model = model.to(device) + + # disable all parameters + for p in model.parameters(): + p.requires_grad = False + + # enable only audio-related parameters + for p in model.module.audio.parameters(): + p.requires_grad = True + + # disable fbsp-parameters + for p in model.module.audio.fbsp.parameters(): + p.requires_grad = False + + # disable logit scaling + model.module.logit_scale_ai.requires_grad = False + model.module.logit_scale_at.requires_grad = False + + # add only enabled parameters to optimizer's list + param_groups = [ + {'params': [p for p in model.module.parameters() if p.requires_grad]} + ] + + # enable fbsp-parameters + for p in model.module.audio.fbsp.parameters(): + p.requires_grad = True + + # enable logit scaling + model.module.logit_scale_ai.requires_grad = True + model.module.logit_scale_at.requires_grad = True + + # add fbsp- and logit scaling parameters to a separate group without weight decay + param_groups.append({ + 'params': [ + p for p in model.module.audio.fbsp.parameters() + ] + [ + model.module.logit_scale_ai, + model.module.logit_scale_at + ], + 'weight_decay': 0.0 + }) + + Optimizer: Type = _utils.load_class(optimizer_class) + optimizer: torch.optim.Optimizer = Optimizer( + param_groups, + **{**optimizer_args, **{'lr': optimizer_args['lr'] * num_gpus}} + ) + + if scheduler_class is not None: + 
Scheduler: Type = _utils.load_class(scheduler_class) + + if scheduler_args is None: + scheduler_args = dict() + + scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = Scheduler(optimizer, **scheduler_args) + else: + scheduler = None + + model_short_name = ''.join([c for c in Network.__name__ if c == c.upper()]) + model_name = '{}{}'.format( + model_short_name, + '-{}'.format(model_suffix) if model_suffix is not None else '' + ) + visdom_env_name = '{}_{}_{}{}'.format( + Dataset.__name__, + experiment_name, + model_name, + '-{}'.format(setup_suffix) if setup_suffix is not None else '' + ) + + vis, vis_pid = _visdom.get_visdom_instance(visdom_host, visdom_port, visdom_env_name, visdom_env_path) + + prog_bar_epochs = tqdm.tqdm(total=epochs, desc='Epochs', file=orig_stdout, dynamic_ncols=True, unit='epoch') + prog_bar_iters = tqdm.tqdm(desc='Batches', file=orig_stdout, dynamic_ncols=True) + + num_params_total = sum(p.numel() for p in model.parameters()) + num_params_train = sum(p.numel() for grp in optimizer.param_groups for p in grp['params']) + + params_total_label = '' + params_train_label = '' + if num_params_total > 1e6: + num_params_total /= 1e6 + params_total_label = 'M' + elif num_params_total > 1e3: + num_params_total /= 1e3 + params_total_label = 'k' + + if num_params_train > 1e6: + num_params_train /= 1e6 + params_train_label = 'M' + elif num_params_train > 1e3: + num_params_train /= 1e3 + params_train_label = 'k' + + tqdm.tqdm.write(f'\n{Network.__name__}\n') + tqdm.tqdm.write('Total number of parameters: {:.2f}{}'.format(num_params_total, params_total_label)) + tqdm.tqdm.write('Number of trainable parameters: {:.2f}{}'.format(num_params_train, params_train_label)) + + def training_step(engine: ieng.Engine, batch) -> torch.Tensor: + model.train() + model.epoch = engine.state.epoch + model.batch_idx = (engine.state.iteration - 1) % len(train_loader) + model.num_batches = len(train_loader) + + optimizer.zero_grad() + + audio, image, text = batch + if audio is not None: + audio = audio.to(device) + if image is not None: + image = image.to(device) + + batch_indices = torch.arange(audio.shape[0], dtype=torch.int64, device=device) + _, loss = model(audio, image, text, batch_indices) + + if loss.ndim > 0: + loss = loss.mean() + + loss.backward(retain_graph=False) + optimizer.step(None) + + return loss.item() + + def eval_step(_: ieng.Engine, batch) -> _interfaces.TensorPair: + model.eval() + + with torch.no_grad(): + audio, _, text = batch + + ((audio_features, _, _), _), _ = model( + audio=audio, + batch_indices=torch.arange(audio.shape[0], dtype=torch.int64, device=device) + ) + audio_features = audio_features.unsqueeze(1) + + ((_, _, text_features), _), _ = model( + text=[ + [eval_loader.dataset.class_idx_to_label[class_idx]] + for class_idx in sorted(eval_loader.dataset.class_idx_to_label.keys()) + ], + batch_indices=torch.arange( + len(eval_loader.dataset.class_idx_to_label), dtype=torch.int64, device=device + ) + ) + text_features = text_features.unsqueeze(1).transpose(0, 1) + + logit_scale_at = torch.clamp(model.module.logit_scale_at.exp(), min=1.0, max=100.0) + y_pred = (logit_scale_at * audio_features @ text_features.transpose(-1, -2)).squeeze(1) + + y = torch.zeros( + audio.shape[0], len(eval_loader.dataset.class_idx_to_label), dtype=torch.int8, device=device + ) + for item_idx, labels in enumerate(text): + class_ids = list(sorted([ + eval_loader.dataset.label_to_class_idx[lb] for lb in labels + ])) + y[item_idx][class_ids] = 1 + + if model.module.multilabel: + y_pred = 
torch.sigmoid(y_pred / logit_scale_at - 0.5) + else: + y_pred = torch.softmax(y_pred, dim=-1) + y = y.argmax(dim=-1) + + return y_pred, y + + trainer = ieng.Engine(training_step) + validator_train = ieng.Engine(eval_step) + validator_eval = ieng.Engine(eval_step) + + # placeholder for summary window + vis.text( + text='', + win=experiment_name, + env=visdom_env_name, + opts={'title': 'Summary', 'width': 940, 'height': 416}, + append=vis.win_exists(experiment_name, visdom_env_name) + ) + + default_metrics = { + "Loss": { + "window_name": None, + "x_label": "#Epochs", + "y_label": model.loss_fn_name if not isinstance(model, torch.nn.DataParallel) else model.module.loss_fn_name, + "width": 940, + "height": 416, + "lines": [ + { + "line_label": "SMA", + "object": imet.RunningAverage(output_transform=lambda x: x), + "test": False, + "update_rate": "iteration" + } + ] + } + } + + performance_metrics = {**default_metrics, **performance_metrics} + checkpoint_metrics = list() + + for scope_name, scope in performance_metrics.items(): + scope['window_name'] = scope.get('window_name', scope_name) or scope_name + + for line in scope['lines']: + if 'object' not in line: + line['object']: imet.Metric = _utils.load_class(line['class'])(**line['args']) + + line['metric_label'] = '{}: {}'.format(scope['window_name'], line['line_label']) + + line['update_rate'] = line.get('update_rate', 'epoch') + line_suffixes = list() + if line['update_rate'] == 'iteration': + line['object'].attach(trainer, line['metric_label']) + line['train'] = False + line['test'] = False + + line_suffixes.append(' Train.') + + if line.get('train', True): + line['object'].attach(validator_train, line['metric_label']) + line_suffixes.append(' Train.') + if line.get('test', True): + line['object'].attach(validator_eval, line['metric_label']) + line_suffixes.append(' Eval.') + + if line.get('is_checkpoint', False): + checkpoint_metrics.append(line['metric_label']) + + for line_suffix in line_suffixes: + _visdom.plot_line( + vis=vis, + window_name=scope['window_name'], + env=visdom_env_name, + line_label=line['line_label'] + line_suffix, + x_label=scope['x_label'], + y_label=scope['y_label'], + width=scope['width'], + height=scope['height'], + draw_marker=(line['update_rate'] == 'epoch') + ) + + if checkpoint_metrics: + score_name = 'performance' + + def get_score(engine: ieng.Engine) -> float: + current_mode = getattr(engine.state.dataloader.iterable.dataset, dataset_args['training']['key']) + val_mode = dataset_args['training']['no'] + + score = 0.0 + if current_mode == val_mode: + for metric_name in checkpoint_metrics: + try: + score += engine.state.metrics[metric_name] + except KeyError: + pass + + return score + + model_saver = ihan.ModelCheckpoint( + os.path.join(saved_models_path, visdom_env_name), + filename_prefix=visdom_env_name, + score_name=score_name, + score_function=get_score, + n_saved=3, + save_as_state_dict=True, + require_empty=False, + create_dir=True + ) + + validator_eval.add_event_handler(ieng.Events.EPOCH_COMPLETED, model_saver, {model_name: model}) + + if not skip_train_val: + @trainer.on(ieng.Events.STARTED) + def engine_started(engine: ieng.Engine): + log_validation(engine, False) + + @trainer.on(ieng.Events.EPOCH_STARTED) + def reset_progress_iterations(engine: ieng.Engine): + prog_bar_iters.clear() + prog_bar_iters.n = 0 + prog_bar_iters.last_print_n = 0 + prog_bar_iters.start_t = time.time() + prog_bar_iters.last_print_t = time.time() + prog_bar_iters.total = len(engine.state.dataloader) + + 
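+        # Per-iteration handler: advances the progress bar, logs and plots the running
+        # loss every `log_interval` iterations, and stops training early if the loss
+        # becomes NaN or Inf.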
@trainer.on(ieng.Events.ITERATION_COMPLETED) + def log_training(engine: ieng.Engine): + prog_bar_iters.update(1) + + num_iter = (engine.state.iteration - 1) % len(train_loader) + 1 + + early_stop = np.isnan(engine.state.output) or np.isinf(engine.state.output) + + if num_iter % log_interval == 0 or num_iter == len(train_loader) or early_stop: + tqdm.tqdm.write( + 'Epoch[{}] Iteration[{}/{}] Loss: {:.4f}'.format( + engine.state.epoch, num_iter, len(train_loader), engine.state.output + ) + ) + + x_pos = engine.state.epoch + num_iter / len(train_loader) - 1 + for scope_name, scope in performance_metrics.items(): + for line in scope['lines']: + if line['update_rate'] == 'iteration': + line_label = '{} Train.'.format(line['line_label']) + line_value = engine.state.metrics[line['metric_label']] + + if engine.state.epoch >= 1: + _visdom.plot_line( + vis=vis, + window_name=scope['window_name'], + env=visdom_env_name, + line_label=line_label, + x_label=scope['x_label'], + y_label=scope['y_label'], + x=np.full(1, x_pos), + y=np.full(1, line_value) + ) + + if early_stop: + tqdm.tqdm.write(colored('Early stopping due to invalid loss value.', 'red')) + trainer.terminate() + + def log_validation(engine: ieng.Engine, + train: bool = True): + + if train: + run_type = 'Train.' + data_loader = train_loader + validator = validator_train + else: + run_type = 'Eval.' + data_loader = eval_loader + validator = validator_eval + + prog_bar_validation = tqdm.tqdm( + data_loader, + desc=f'Validation {run_type}', + file=orig_stdout, + dynamic_ncols=True, + leave=False + ) + validator.run(prog_bar_validation) + prog_bar_validation.clear() + prog_bar_validation.close() + + tqdm_info = [ + 'Epoch: {}'.format(engine.state.epoch) + ] + for scope_name, scope in performance_metrics.items(): + for line in scope['lines']: + if line['update_rate'] == 'epoch': + try: + line_label = '{} {}'.format(line['line_label'], run_type) + line_value = validator.state.metrics[line['metric_label']] + + _visdom.plot_line( + vis=vis, + window_name=scope['window_name'], + env=visdom_env_name, + line_label=line_label, + x_label=scope['x_label'], + y_label=scope['y_label'], + x=np.full(1, engine.state.epoch), + y=np.full(1, line_value), + draw_marker=True + ) + + tqdm_info.append('{}: {:.4f}'.format(line_label, line_value)) + except KeyError: + pass + + tqdm.tqdm.write('{} results - {}'.format(run_type, '; '.join(tqdm_info))) + + if not skip_train_val: + @trainer.on(ieng.Events.EPOCH_COMPLETED) + def log_validation_train(engine: ieng.Engine): + log_validation(engine, True) + + @trainer.on(ieng.Events.EPOCH_COMPLETED) + def log_validation_eval(engine: ieng.Engine): + log_validation(engine, False) + + if engine.state.epoch == 1: + summary = _utils.build_summary_str( + experiment_name=experiment_name, + model_short_name=model_name, + model_class=model_class, + model_args=model_args, + optimizer_class=optimizer_class, + optimizer_args=optimizer_args, + dataset_class=dataset_class, + dataset_args=dataset_args, + transforms=transforms, + epochs=epochs, + batch_train=batch_train, + log_interval=log_interval, + saved_models_path=saved_models_path, + scheduler_class=scheduler_class, + scheduler_args=scheduler_args + ) + _visdom.create_summary_window( + vis=vis, + visdom_env_name=visdom_env_name, + experiment_name=experiment_name, + summary=summary + ) + + vis.save([visdom_env_name]) + + prog_bar_epochs.update(1) + + if scheduler is not None: + scheduler.step(engine.state.epoch) + + trainer.run(train_loader, max_epochs=epochs) + + if vis_pid is not None: 
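+            # vis_pid is set only when this process spawned the visdom server itself,
+            # so it is safe to shut that server down once training is finished.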
+ tqdm.tqdm.write('Stopping visdom') + os.kill(vis_pid, signal.SIGTERM) + + del vis + del train_loader + del eval_loader + + prog_bar_iters.clear() + prog_bar_iters.close() + + prog_bar_epochs.clear() + prog_bar_epochs.close() + + tqdm.tqdm.write('\n') + + +def main(): + with _utils.tqdm_stdout() as orig_stdout: + parser = argparse.ArgumentParser() + + parser.add_argument('-c', '--config', type=str, required=True) + parser.add_argument('-H', '--visdom-host', type=str, required=False) + parser.add_argument('-P', '--visdom-port', type=int, required=False) + parser.add_argument('-E', '--visdom-env-path', type=str, required=False) + parser.add_argument('-b', '--batch-train', type=int, required=False) + parser.add_argument('-B', '--batch-test', type=int, required=False) + parser.add_argument('-w', '--workers-train', type=int, required=False) + parser.add_argument('-W', '--workers-test', type=int, required=False) + parser.add_argument('-e', '--epochs', type=int, required=False) + parser.add_argument('-L', '--log-interval', type=int, required=False) + parser.add_argument('-M', '--saved-models-path', type=str, required=False) + parser.add_argument('-R', '--random-seed', type=int, required=False) + parser.add_argument('-s', '--suffix', type=str, required=False) + parser.add_argument('-S', '--skip-train-val', action='store_true', default=False) + + args, unknown_args = parser.parse_known_args() + + if args.batch_test is None: + args.batch_test = args.batch_train + + if args.random_seed is not None: + args.suffix = '{}r-{}'.format( + '{}_'.format(args.suffix) if args.suffix is not None else '', + args.random_seed + ) + + np.random.seed(args.random_seed) + torch.random.manual_seed(args.random_seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(args.random_seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + configs_found = list(sorted(glob.glob(os.path.expanduser(args.config)))) + prog_bar_exps = tqdm.tqdm( + configs_found, + desc='Experiments', + unit='setup', + file=orig_stdout, + dynamic_ncols=True + ) + + for config_path in prog_bar_exps: + config = json.load(open(config_path)) + + if unknown_args: + tqdm.tqdm.write('\nParsing additional arguments...') + + args_not_found = list() + for arg in unknown_args: + if arg.startswith('--'): + keys = arg.strip('-').split('.') + + section = config + found = True + for key in keys: + if key in section: + section = section[key] + else: + found = False + break + + if found: + override_parser = argparse.ArgumentParser() + + section_nargs = None + section_type = type(section) if section is not None else str + + if section_type is bool: + if section_type is bool: + def infer_bool(x: str) -> bool: + return x.lower() not in ('0', 'false', 'no') + + section_type = infer_bool + + if isinstance(section, Iterable) and section_type is not str: + section_nargs = '+' + section_type = {type(value) for value in section} + + if len(section_type) == 1: + section_type = section_type.pop() + else: + section_type = str + + override_parser.add_argument(arg, nargs=section_nargs, type=section_type) + overridden_args, _ = override_parser.parse_known_args(unknown_args) + overridden_args = vars(overridden_args) + + overridden_key = arg.strip('-') + overriding_value = overridden_args[overridden_key] + + section = config + old_value = None + for i, key in enumerate(keys, 1): + if i == len(keys): + old_value = section[key] + section[key] = overriding_value + else: + section = section[key] + + tqdm.tqdm.write( + colored(f'Overriding 
"{overridden_key}": {old_value} -> {overriding_value}', 'magenta') + ) + else: + args_not_found.append(arg) + + if args_not_found: + tqdm.tqdm.write( + colored( + '\nThere are unrecognized arguments to override: {}'.format( + ', '.join(args_not_found) + ), + 'red' + ) + ) + + config = defaultdict(None, config) + + experiment_name = config['Setup']['name'] + + visdom_host = _utils.arg_selector( + args.visdom_host, config['Visdom']['host'], VISDOM_HOST + ) + visdom_port = int(_utils.arg_selector( + args.visdom_port, config['Visdom']['port'], VISDOM_PORT + )) + visdom_env_path = _utils.arg_selector( + args.visdom_env_path, config['Visdom']['env_path'], VISDOM_ENV_PATH + ) + batch_train = int(_utils.arg_selector( + args.batch_train, config['Setup']['batch_train'], BATCH_TRAIN + )) + batch_test = int(_utils.arg_selector( + args.batch_test, config['Setup']['batch_test'], BATCH_TEST + )) + workers_train = _utils.arg_selector( + args.workers_train, config['Setup']['workers_train'], WORKERS_TRAIN + ) + workers_test = _utils.arg_selector( + args.workers_test, config['Setup']['workers_test'], WORKERS_TEST + ) + epochs = _utils.arg_selector( + args.epochs, config['Setup']['epochs'], EPOCHS + ) + log_interval = _utils.arg_selector( + args.log_interval, config['Setup']['log_interval'], LOG_INTERVAL + ) + saved_models_path = _utils.arg_selector( + args.saved_models_path, config['Setup']['saved_models_path'], SAVED_MODELS_PATH + ) + + model_class = config['Model']['class'] + model_args = config['Model']['args'] + + optimizer_class = config['Optimizer']['class'] + optimizer_args = config['Optimizer']['args'] + + if 'Scheduler' in config: + scheduler_class = config['Scheduler']['class'] + scheduler_args = config['Scheduler']['args'] + else: + scheduler_class = None + scheduler_args = None + + dataset_class = config['Dataset']['class'] + dataset_args = config['Dataset']['args'] + + transforms = config['Transforms'] + performance_metrics = config['Metrics'] + + tqdm.tqdm.write(f'\nStarting experiment "{experiment_name}"\n') + + run( + experiment_name=experiment_name, + visdom_host=visdom_host, + visdom_port=visdom_port, + visdom_env_path=visdom_env_path, + model_class=model_class, + model_args=model_args, + optimizer_class=optimizer_class, + optimizer_args=optimizer_args, + dataset_class=dataset_class, + dataset_args=dataset_args, + batch_train=batch_train, + batch_test=batch_test, + workers_train=workers_train, + workers_test=workers_test, + transforms=transforms, + epochs=epochs, + log_interval=log_interval, + saved_models_path=saved_models_path, + performance_metrics=performance_metrics, + scheduler_class=scheduler_class, + scheduler_args=scheduler_args, + model_suffix=config['Setup']['suffix'], + setup_suffix=args.suffix, + orig_stdout=orig_stdout, + skip_train_val=args.skip_train_val + ) + + prog_bar_exps.close() + + tqdm.tqdm.write('\n') diff --git a/ignite_trainer/_utils.py b/ignite_trainer/_utils.py new file mode 100644 index 0000000..25f7e11 --- /dev/null +++ b/ignite_trainer/_utils.py @@ -0,0 +1,221 @@ +import io +import sys +import json +import tqdm +import datetime +import importlib +import contextlib + +import numpy as np + +import torch +import torch.utils.data as td + +import torchvision as tv + +from PIL import Image + +from collections import OrderedDict + +from typing import Any +from typing import Dict +from typing import List +from typing import Type +from typing import Tuple +from typing import Union +from typing import Callable +from typing import Optional + + +@contextlib.contextmanager 
+def tqdm_stdout(orig_stdout: Optional[io.TextIOBase] = None): + + class DummyFile(object): + file = None + + def __init__(self, file): + self.file = file + + def write(self, x): + if len(x.rstrip()) > 0: + tqdm.tqdm.write(x, file=self.file) + + def flush(self): + return getattr(self.file, 'flush', lambda: None)() + + orig_out_err = sys.stdout, sys.stderr + + try: + if orig_stdout is None: + sys.stdout, sys.stderr = map(DummyFile, orig_out_err) + yield orig_out_err[0] + else: + yield orig_stdout + except Exception as exc: + raise exc + finally: + sys.stdout, sys.stderr = orig_out_err + + +def load_class(package_name: str, class_name: Optional[str] = None) -> Type: + if class_name is None: + package_name, class_name = package_name.rsplit('.', 1) + + importlib.invalidate_caches() + + package = importlib.import_module(package_name) + cls = getattr(package, class_name) + + return cls + + +def arg_selector(arg_cmd: Optional[Any], arg_conf: Optional[Any], arg_const: Any) -> Any: + if arg_cmd is not None: + return arg_cmd + else: + if arg_conf is not None: + return arg_conf + else: + return arg_const + + +def collate_fn(batch): + batch_audio, batch_image, batch_text = zip(*batch) + + keep_ids = [idx for idx, (_, _) in enumerate(zip(batch_audio, batch_image))] + + if not all(audio is None for audio in batch_audio): + batch_audio = [batch_audio[idx] for idx in keep_ids] + batch_audio = torch.stack(batch_audio) + else: + batch_audio = None + + if not all(image is None for image in batch_image): + batch_image = [batch_image[idx] for idx in keep_ids] + batch_image = torch.stack(batch_image) + else: + batch_image = None + + if not all(text is None for text in batch_text): + batch_text = [batch_text[idx] for idx in keep_ids] + else: + batch_text = None + + return batch_audio, batch_image, batch_text + + +def get_data_loaders(Dataset: Type, + dataset_args: Dict[str, Any], + batch_train: int = 64, + batch_test: int = 1024, + workers_train: int = 0, + workers_test: int = 0, + transforms_train: Optional[Callable[ + [Union[np.ndarray, torch.Tensor]], Union[np.ndarray, torch.Tensor] + ]] = None, + transforms_test: Optional[Callable[ + [Union[np.ndarray, torch.Tensor]], Union[np.ndarray, torch.Tensor] + ]] = None) -> Tuple[torch.utils.data.DataLoader, torch.utils.data.DataLoader]: + + dl_shuffle = dataset_args.pop('dl_shuffle', True) + + dataset_mode_train = {dataset_args['training']['key']: dataset_args['training']['yes']} + dataset_mode_test = {dataset_args['training']['key']: dataset_args['training']['no']} + + dataset_args_train = {**{k: v for k, v in dataset_args.items() if k != 'training'}, **dataset_mode_train} + dataset_args_test = {**{k: v for k, v in dataset_args.items() if k != 'training'}, **dataset_mode_test} + + ds_train = Dataset(**{ + **dataset_args_train, + **{'transform_audio': transforms_train}, + **{'transform_frames': tv.transforms.Compose([ + tv.transforms.ToTensor(), + tv.transforms.Resize(224, interpolation=Image.BICUBIC), + tv.transforms.CenterCrop(224), + tv.transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)) + ])} + }) + train_loader = torch.utils.data.DataLoader( + ds_train, + batch_size=batch_train, + shuffle=dl_shuffle, + num_workers=workers_train, + pin_memory=True, + collate_fn=collate_fn, + drop_last=True + ) + ds_eval = Dataset(**{ + **dataset_args_test, + **{'transform_audio': transforms_test}, + **{'transform_frames': tv.transforms.Compose([ + tv.transforms.ToTensor(), + tv.transforms.Resize(224, interpolation=Image.BICUBIC), + 
tv.transforms.CenterCrop(224), + tv.transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)) + ])} + }) + eval_loader = torch.utils.data.DataLoader( + ds_eval, + batch_size=batch_test, + num_workers=workers_test, + pin_memory=True, + collate_fn=collate_fn + ) + + return train_loader, eval_loader + + +def build_summary_str(experiment_name: str, + model_short_name: str, + model_class: str, + model_args: Dict[str, Any], + optimizer_class: str, + optimizer_args: Dict[str, Any], + dataset_class: str, + dataset_args: Dict[str, Any], + transforms: List[Dict[str, Union[str, Dict[str, Any]]]], + epochs: int, + batch_train: int, + log_interval: int, + saved_models_path: str, + scheduler_class: Optional[str] = None, + scheduler_args: Optional[Dict[str, Any]] = None) -> str: + + setup_title = '{}-{}'.format(experiment_name, model_short_name) + + summary_window_text = '

<h3>{}</h3>'.format(setup_title)
+
+    summary_window_text += '<b>Model</b>: {}<br><pre>{}</pre>'.format(model_class, json.dumps(model_args, indent=2))
+    summary_window_text += '<b>Optimizer</b>: {}<br><pre>{}</pre>'.format(optimizer_class, json.dumps(optimizer_args, indent=2))
+    summary_window_text += '<b>Scheduler</b>: {}<br><pre>{}</pre>'.format(scheduler_class, json.dumps(scheduler_args, indent=2))
+    summary_window_text += '<b>Dataset</b>: {}<br><pre>{}</pre>'.format(dataset_class, json.dumps(dataset_args, indent=2))
+    summary_window_text += '<b>Transforms</b>:<pre>{}</pre>'.format(json.dumps(transforms, indent=2))
+    summary_window_text += '<b>Epochs</b>: {}, <b>batch size</b>: {}, <b>log interval</b>: {}<br>'.format(epochs, batch_train, log_interval)
+    summary_window_text += '<b>Saved models path</b>: {}<br>'.format(saved_models_path)
+    summary_window_text += '<hr>
' + + return summary_window_text diff --git a/ignite_trainer/_visdom.py b/ignite_trainer/_visdom.py new file mode 100644 index 0000000..9d2ffa6 --- /dev/null +++ b/ignite_trainer/_visdom.py @@ -0,0 +1,191 @@ +import os +import sys +import json +import time +import tqdm +import socket +import subprocess +import numpy as np + +import visdom + +from typing import Tuple +from typing import Optional + + +def calc_ytick_range(vis: visdom.Visdom, window_name: str, env: Optional[str] = None) -> Tuple[float, float]: + lower_bound, upper_bound = -1.0, 1.0 + + stats = vis.get_window_data(win=window_name, env=env) + + if stats: + stats = json.loads(stats) + + stats = [np.array(item['y']) for item in stats['content']['data']] + stats = [item[item != np.array([None])].astype(np.float16) for item in stats] + + if stats: + q25s = np.array([np.quantile(item, 0.25) for item in stats if len(item) > 0]) + q75s = np.array([np.quantile(item, 0.75) for item in stats if len(item) > 0]) + + if q25s.shape == q75s.shape and len(q25s) > 0: + iqrs = q75s - q25s + + lower_bounds = q25s - 1.5 * iqrs + upper_bounds = q75s + 1.5 * iqrs + + stats_sanitized = list() + idx = 0 + for item in stats: + if len(item) > 0: + item_sanitized = item[(item >= lower_bounds[idx]) & (item <= upper_bounds[idx])] + stats_sanitized.append(item_sanitized) + + idx += 1 + + stats_sanitized = np.array(stats_sanitized) + + q25_sanitized = np.array([np.quantile(item, 0.25) for item in stats_sanitized]) + q75_sanitized = np.array([np.quantile(item, 0.75) for item in stats_sanitized]) + + iqr_sanitized = np.sum(q75_sanitized - q25_sanitized) + lower_bound = np.min(q25_sanitized) - 1.5 * iqr_sanitized + upper_bound = np.max(q75_sanitized) + 1.5 * iqr_sanitized + + return lower_bound, upper_bound + + +def plot_line(vis: visdom.Visdom, + window_name: str, + env: Optional[str] = None, + line_label: Optional[str] = None, + x: Optional[np.ndarray] = None, + y: Optional[np.ndarray] = None, + x_label: Optional[str] = None, + y_label: Optional[str] = None, + width: int = 576, + height: int = 416, + draw_marker: bool = False) -> str: + + empty_call = not vis.win_exists(window_name) + + if empty_call and (x is not None or y is not None): + return window_name + + if x is None: + x = np.ones(1) + empty_call = empty_call & True + + if y is None: + y = np.full(1, np.nan) + empty_call = empty_call & True + + if x.shape != y.shape: + x = np.ones_like(y) + + opts = { + 'showlegend': True, + 'markers': draw_marker, + 'markersize': 5, + } + + if empty_call: + opts['title'] = window_name + opts['width'] = width + opts['height'] = height + + window_name = vis.line( + X=x, + Y=y, + win=window_name, + env=env, + update='append', + name=line_label, + opts=opts + ) + + xtickmin, xtickmax = 0.0, np.max(x) * 1.05 + ytickmin, ytickmax = calc_ytick_range(vis, window_name, env) + + opts = { + 'showlegend': True, + 'xtickmin': xtickmin, + 'xtickmax': xtickmax, + 'ytickmin': ytickmin, + 'ytickmax': ytickmax, + 'xlabel': x_label, + 'ylabel': y_label + } + + window_name = vis.update_window_opts(win=window_name, opts=opts, env=env) + + return window_name + + +def create_summary_window(vis: visdom.Visdom, + visdom_env_name: str, + experiment_name: str, + summary: str) -> str: + + return vis.text( + text=summary, + win=experiment_name, + env=visdom_env_name, + opts={'title': 'Summary', 'width': 576, 'height': 416}, + append=vis.win_exists(experiment_name, visdom_env_name) + ) + + +def connection_is_alive(host: str, port: int) -> bool: + with socket.socket(socket.AF_INET, 
socket.SOCK_STREAM) as sock: + try: + sock.connect((host, port)) + sock.shutdown(socket.SHUT_RDWR) + + return True + except socket.error: + return False + + +def get_visdom_instance(host: str = 'localhost', + port: int = 8097, + env_name: str = 'main', + env_path: str = 'visdom_env') -> Tuple[visdom.Visdom, Optional[int]]: + + vis_pid = None + + if not connection_is_alive(host, port): + if any(host.strip('/').endswith(lh) for lh in ['127.0.0.1', 'localhost']): + os.makedirs(env_path, exist_ok=True) + + tqdm.tqdm.write('Starting visdom on port {}'.format(port), end='') + + vis_args = [ + sys.executable, + '-m', 'visdom.server', + '-port', str(port), + '-env_path', os.path.join(os.getcwd(), env_path) + ] + vis_proc = subprocess.Popen(vis_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + time.sleep(2.0) + + vis_pid = vis_proc.pid + tqdm.tqdm.write('PID -> {}'.format(vis_pid)) + + trials_left = 5 + while not connection_is_alive(host, port): + time.sleep(1.0) + + tqdm.tqdm.write('Trying to connect ({} left)...'.format(trials_left)) + + trials_left -= 1 + if trials_left < 1: + raise RuntimeError('Visdom server is not running. Please run "python -m visdom.server".') + + vis = visdom.Visdom( + server='http://{}'.format(host), + port=port, + env=env_name + ) + + return vis, vis_pid diff --git a/ignite_trainer/version.py b/ignite_trainer/version.py new file mode 100644 index 0000000..c73c6e0 --- /dev/null +++ b/ignite_trainer/version.py @@ -0,0 +1 @@ +__version__ = '0.2.5b5' diff --git a/main.py b/main.py new file mode 100644 index 0000000..821b1c2 --- /dev/null +++ b/main.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python3.7 + +import ignite_trainer as it + + +def main(): + it.main() + + +if __name__ == '__main__': + main() diff --git a/model/__init__.py b/model/__init__.py new file mode 100644 index 0000000..956e839 --- /dev/null +++ b/model/__init__.py @@ -0,0 +1,3 @@ +from .clip import * +from .esresnet import * +from .audioclip import AudioCLIP diff --git a/model/audioclip.py b/model/audioclip.py new file mode 100644 index 0000000..9dd42d7 --- /dev/null +++ b/model/audioclip.py @@ -0,0 +1,255 @@ +import os + +import torch +import torch.nn.functional as F + +from model.clip import CLIP +from model.clip.clip import tokenize +from model.esresnet import ESResNeXtFBSP + +from typing import List +from typing import Tuple +from typing import Union +from typing import Optional + + +ClipFeatures = Tuple[ + Optional[torch.Tensor], # audio + Optional[torch.Tensor], # image + Optional[torch.Tensor] # audio +] + + +ClipLogits = Tuple[ + Optional[torch.Tensor], # audio x image + Optional[torch.Tensor], # audio x text + Optional[torch.Tensor] # image x text +] + + +ClipOutput = Tuple[ + Tuple[ClipFeatures, ClipLogits], + Optional[torch.Tensor] # loss +] + + +class AudioCLIP(CLIP): + + def __init__(self, + embed_dim: int = 1024, + # vision + image_resolution: int = 224, + vision_layers: Union[Tuple[int, int, int, int], int] = (3, 4, 6, 3), + vision_width: int = 64, + vision_patch_size: Optional[int] = None, + # text + context_length: int = 77, + vocab_size: int = 49408, + transformer_width: int = 512, + transformer_heads: int = 8, + transformer_layers: int = 12, + # audio + n_fft: int = 2048, + hop_length: Optional[int] = 561, + win_length: Optional[int] = 1654, + window: Optional[str] = 'blackmanharris', + normalized: bool = True, + onesided: bool = True, + spec_height: int = -1, + spec_width: int = -1, + apply_attention: bool = True, + multilabel: bool = True, + pretrained: Union[bool, str] = True): + + 
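+        # The parent CLIP constructor builds the vision and text towers; the audio
+        # tower (ESResNeXtFBSP) and the audio-specific logit scales are added below.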
super(AudioCLIP, self).__init__( + embed_dim=embed_dim, + image_resolution=image_resolution, + vision_layers=vision_layers, + vision_width=vision_width, + vision_patch_size=vision_patch_size, + context_length=context_length, + vocab_size=vocab_size, + transformer_width=transformer_width, + transformer_heads=transformer_heads, + transformer_layers=transformer_layers + ) + + self.audio = ESResNeXtFBSP( + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + normalized=normalized, + onesided=onesided, + spec_height=spec_height, + spec_width=spec_width, + num_classes=embed_dim, + apply_attention=apply_attention, + pretrained=False + ) + + self.multilabel = multilabel + self.pretrained = pretrained + + self.logit_scale_ai = torch.nn.Parameter(torch.log(torch.ones([]) * 100)) + self.logit_scale_at = torch.nn.Parameter(torch.log(torch.ones([]) * 100)) + + if isinstance(self.pretrained, str): + self.load_state_dict(torch.load(self.pretrained, map_location='cpu'), strict=False) + elif self.pretrained: + self.load_state_dict(torch.load( + os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'assets', 'CLIP.pt'), + map_location='cpu' + ), strict=False) + print('Image & Text weights loaded') + try: + self.audio.load_state_dict(torch.load( + os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'assets', 'ESRNXFBSP.pt'), + map_location='cpu' + ), strict=False) + except RuntimeError as ex: + print(ex) + print('Audio weights loaded') + + self.embed_dim = embed_dim + + @property + def device(self): + return self.visual.conv1.weight.device + + def encode_audio(self, audio: torch.Tensor) -> torch.Tensor: + return self.audio(audio.to(self.device)) + + def encode_text(self, + text: List[List[str]], + base_str: str = '{}', + batch_indices: Optional[torch.Tensor] = None) -> torch.Tensor: + + if batch_indices is not None: + text = [text[idx] for idx in batch_indices] + + text_joined = [', '.join(entities) for entities in text] + text_tokens = torch.cat([ + tokenize(base_str.format(entities)) for entities in text_joined + ]) + text_tokens = text_tokens.to(self.device) + + return super(AudioCLIP, self).encode_text(text_tokens) + + def forward(self, + audio: Optional[torch.Tensor] = None, + image: Optional[torch.Tensor] = None, + text: Optional[List[List[str]]] = None, + batch_indices: Optional[torch.Tensor] = None) -> ClipOutput: + + audio_features = None + image_features = None + text_features = None + sample_weights = None + + if audio is not None: + audio_features = self.encode_audio(audio) + audio_features = audio_features / audio_features.norm(dim=-1, keepdim=True) + + if image is not None: + image_features = self.encode_image(image) + image_features = image_features / image_features.norm(dim=-1, keepdim=True) + + if text is not None: + if batch_indices is None: + batch_indices = torch.arange(len(text), dtype=self.dtype, device=self.device) + + text_features = self.encode_text(text, '{}', batch_indices) + text_features = text_features / text_features.norm(dim=-1, keepdim=True) + + if hasattr(self, 'class_weights') and hasattr(self, 'label_to_class_idx'): + sample_weights = torch.stack([ + sum(self.class_weights[self.label_to_class_idx[label]] for label in entities) + for idx, entities in enumerate(text) if idx in batch_indices + ]) + + features: ClipFeatures = (audio_features, image_features, text_features) + + logit_scale_ai = torch.clamp(self.logit_scale_ai.exp(), min=1.0, max=100.0) + logit_scale_at = torch.clamp(self.logit_scale_at.exp(), min=1.0, max=100.0) 
+ logit_scale_it = torch.clamp(self.logit_scale.exp(), min=1.0, max=100.0) + + logits_audio_image = None + logits_audio_text = None + logits_image_text = None + + if (audio_features is not None) and (image_features is not None): + logits_audio_image = logit_scale_ai * audio_features @ image_features.T + + if (audio_features is not None) and (text_features is not None): + logits_audio_text = logit_scale_at * audio_features @ text_features.T + + if (image_features is not None) and (text_features is not None): + logits_image_text = logit_scale_it * image_features @ text_features.T + + logits: ClipLogits = (logits_audio_image, logits_audio_text, logits_image_text) + + loss = self.loss_fn(logits, sample_weights) + if audio is not None and loss is not None: + loss = loss + self.audio.loss_ttf(self.device) + + return (features, logits), loss + + def loss_fn(self, logits: ClipLogits, sample_weights: Optional[torch.Tensor] = None) -> Optional[torch.Tensor]: + logits_audio_image, logits_audio_text, logits_image_text = logits + + if logits_audio_image is not None: + batch_size = logits_audio_image.shape[0] + elif logits_audio_text is not None: + batch_size = logits_audio_text.shape[0] + elif logits_image_text is not None: + batch_size = logits_image_text.shape[0] + else: + return None + + reference = torch.arange( + batch_size, + dtype=torch.int64, + device=self.device + ) + + loss = torch.tensor(0.0, dtype=self.dtype, device=self.device) + + num_modalities: int = 0 + scale = torch.tensor(1.0, dtype=self.dtype, device=self.device) + + if logits_audio_image is not None: + loss_ai = F.cross_entropy( + logits_audio_image, reference, weight=sample_weights + ) + F.cross_entropy( + logits_audio_image.transpose(-1, -2), reference, weight=sample_weights + ) + loss = loss + loss_ai + num_modalities += 1 + + if logits_audio_text is not None: + loss_at = F.cross_entropy( + logits_audio_text, reference, weight=sample_weights + ) + F.cross_entropy( + logits_audio_text.transpose(-1, -2), reference, weight=sample_weights + ) + loss = loss + loss_at + num_modalities += 1 + + if logits_image_text is not None: + loss_it = F.cross_entropy( + logits_image_text, reference, weight=sample_weights + ) + F.cross_entropy( + logits_image_text.transpose(-1, -2), reference, weight=sample_weights + ) + loss = loss + loss_it + num_modalities += 1 + + for idx in range(num_modalities): + scale = scale * (idx + 1) + + return loss / scale + + @property + def loss_fn_name(self) -> str: + return 'Cross Entropy' diff --git a/model/clip/__init__.py b/model/clip/__init__.py new file mode 100644 index 0000000..b235f83 --- /dev/null +++ b/model/clip/__init__.py @@ -0,0 +1,5 @@ +from .model import CLIP +from .model import convert_weights + + +__all__ = ['CLIP', 'convert_weights'] diff --git a/model/clip/clip.py b/model/clip/clip.py new file mode 100644 index 0000000..c4776ef --- /dev/null +++ b/model/clip/clip.py @@ -0,0 +1,193 @@ +# CREDITS: https://github.com/openai/CLIP + +import hashlib +import os +import urllib +import warnings +from typing import Union, List + +import torch +from PIL import Image +from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize +from tqdm import tqdm + +from .model import build_model +from utils.simple_tokenizer import SimpleTokenizer as _Tokenizer + +__all__ = ["available_models", "load", "tokenize"] +_tokenizer = _Tokenizer() + +_MODELS = { + "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt", + 
"ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt", +} + + +def _download(url: str, root: str = os.path.expanduser("~/.cache/clip")): + os.makedirs(root, exist_ok=True) + filename = os.path.basename(url) + + expected_sha256 = url.split("/")[-2] + download_target = os.path.join(root, filename) + + if os.path.exists(download_target) and not os.path.isfile(download_target): + raise RuntimeError(f"{download_target} exists and is not a regular file") + + if os.path.isfile(download_target): + if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256: + return download_target + else: + warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file") + + with urllib.request.urlopen(url) as source, open(download_target, "wb") as output: + with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True) as loop: + while True: + buffer = source.read(8192) + if not buffer: + break + + output.write(buffer) + loop.update(len(buffer)) + + if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256: + raise RuntimeError(f"Model has been downloaded but the SHA256 checksum does not not match") + + return download_target + + +def _transform(n_px): + return Compose([ + Resize(n_px, interpolation=Image.BICUBIC), + CenterCrop(n_px), + lambda image: image.convert("RGB"), + ToTensor(), + Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), + ]) + + +def available_models() -> List[str]: + """Returns the names of available CLIP models""" + return list(_MODELS.keys()) + + +def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", jit=True): + """Load a CLIP model + + Parameters + ---------- + name : str + A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict + + device : Union[str, torch.device] + The device to put the loaded model + + jit : bool + Whether to load the optimized JIT model (default) or more hackable non-JIT model. + + Returns + ------- + model : torch.nn.Module + The CLIP model + + preprocess : Callable[[PIL.Image], torch.Tensor] + A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input + """ + if name in _MODELS: + model_path = _download(_MODELS[name]) + elif os.path.isfile(name): + model_path = name + else: + raise RuntimeError(f"Model {name} not found; available models = {available_models()}") + + try: + # loading JIT archive + model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval() + state_dict = None + except RuntimeError: + # loading saved state dict + if jit: + warnings.warn(f"File {model_path} is not a JIT archive. 
Loading as a state dict instead") + jit = False + state_dict = torch.load(model_path, map_location="cpu") + + if not jit: + model = build_model(state_dict or model.state_dict()).to(device) + if str(device) == "cpu": + model.float() + return model, _transform(model.visual.input_resolution) + + # patch the device names + device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[]) + device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1] + + def patch_device(module): + graphs = [module.graph] if hasattr(module, "graph") else [] + if hasattr(module, "forward1"): + graphs.append(module.forward1.graph) + + for graph in graphs: + for node in graph.findAllNodes("prim::Constant"): + if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"): + node.copyAttributes(device_node) + + model.apply(patch_device) + patch_device(model.encode_image) + patch_device(model.encode_text) + + # patch dtype to float32 on CPU + if str(device) == "cpu": + float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[]) + float_input = list(float_holder.graph.findNode("aten::to").inputs())[1] + float_node = float_input.node() + + def patch_float(module): + graphs = [module.graph] if hasattr(module, "graph") else [] + if hasattr(module, "forward1"): + graphs.append(module.forward1.graph) + + for graph in graphs: + for node in graph.findAllNodes("aten::to"): + inputs = list(node.inputs()) + for i in [1, 2]: # dtype can be the second or third argument to aten::to() + if inputs[i].node()["value"] == 5: + inputs[i].node().copyAttributes(float_node) + + model.apply(patch_float) + patch_float(model.encode_image) + patch_float(model.encode_text) + + model.float() + + return model, _transform(model.input_resolution.item()) + + +def tokenize(texts: Union[str, List[str]], context_length: int = 77) -> torch.LongTensor: + """ + Returns the tokenized representation of given input string(s) + + Parameters + ---------- + texts : Union[str, List[str]] + An input string or a list of input strings to tokenize + + context_length : int + The context length to use; all CLIP models use 77 as the context length + + Returns + ------- + A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] + """ + if isinstance(texts, str): + texts = [texts] + + sot_token = _tokenizer.encoder["<|startoftext|>"] + eot_token = _tokenizer.encoder["<|endoftext|>"] + all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts] + result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) + + for i, tokens in enumerate(all_tokens): + if len(tokens) > context_length: + raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}") + result[i, :len(tokens)] = torch.tensor(tokens) + + return result diff --git a/model/clip/model.py b/model/clip/model.py new file mode 100644 index 0000000..356482d --- /dev/null +++ b/model/clip/model.py @@ -0,0 +1,433 @@ +# CREDITS: https://github.com/openai/CLIP + +from collections import OrderedDict +from typing import Tuple, Union + +import torch +import torch.nn.functional as F +from torch import nn + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1): + super().__init__() + + # all conv layers have stride 1. 
an avgpool is performed after the second convolution when stride > 1 + self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + + self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity() + + self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + + self.relu = nn.ReLU(inplace=True) + self.downsample = None + self.stride = stride + + if stride > 1 or inplanes != planes * Bottleneck.expansion: + # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1 + self.downsample = nn.Sequential(OrderedDict([ + ("-1", nn.AvgPool2d(stride)), + ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)), + ("1", nn.BatchNorm2d(planes * self.expansion)) + ])) + + def forward(self, x: torch.Tensor): + identity = x + + out = self.relu(self.bn1(self.conv1(x))) + out = self.relu(self.bn2(self.conv2(out))) + out = self.avgpool(out) + out = self.bn3(self.conv3(out)) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + return out + + +class AttentionPool2d(nn.Module): + def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None): + super().__init__() + self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5) + self.k_proj = nn.Linear(embed_dim, embed_dim) + self.q_proj = nn.Linear(embed_dim, embed_dim) + self.v_proj = nn.Linear(embed_dim, embed_dim) + self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim) + self.num_heads = num_heads + + def forward(self, x): + x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1) # NCHW -> (HW)NC + x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC + x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC + x, _ = F.multi_head_attention_forward( + query=x, key=x, value=x, + embed_dim_to_check=x.shape[-1], + num_heads=self.num_heads, + q_proj_weight=self.q_proj.weight, + k_proj_weight=self.k_proj.weight, + v_proj_weight=self.v_proj.weight, + in_proj_weight=None, + in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]), + bias_k=None, + bias_v=None, + add_zero_attn=False, + dropout_p=0, + out_proj_weight=self.c_proj.weight, + out_proj_bias=self.c_proj.bias, + use_separate_proj_weight=True, + training=self.training, + need_weights=False + ) + + return x[0] + + +class ModifiedResNet(nn.Module): + """ + A ResNet class that is similar to torchvision's but contains the following changes: + - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. 
+ - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 + - The final pooling layer is a QKV attention instead of an average pool + """ + + def __init__(self, layers, output_dim, heads, input_resolution=224, width=64): + super().__init__() + self.output_dim = output_dim + self.input_resolution = input_resolution + + # the 3-layer stem + self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(width // 2) + self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(width // 2) + self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(width) + self.avgpool = nn.AvgPool2d(2) + self.relu = nn.ReLU(inplace=True) + + # residual layers + self._inplanes = width # this is a *mutable* variable used during construction + self.layer1 = self._make_layer(width, layers[0]) + self.layer2 = self._make_layer(width * 2, layers[1], stride=2) + self.layer3 = self._make_layer(width * 4, layers[2], stride=2) + self.layer4 = self._make_layer(width * 8, layers[3], stride=2) + + embed_dim = width * 32 # the ResNet feature dimension + self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim) + + def _make_layer(self, planes, blocks, stride=1): + layers = [Bottleneck(self._inplanes, planes, stride)] + + self._inplanes = planes * Bottleneck.expansion + for _ in range(1, blocks): + layers.append(Bottleneck(self._inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + def stem(x): + for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), (self.conv3, self.bn3)]: + x = self.relu(bn(conv(x))) + x = self.avgpool(x) + return x + + x = x.type(self.conv1.weight.dtype) + x = stem(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.attnpool(x) + + return x + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16.""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + + +class QuickGELU(nn.Module): + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Module): + def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None): + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential(OrderedDict([ + ("c_fc", nn.Linear(d_model, d_model * 4)), + ("gelu", QuickGELU()), + ("c_proj", nn.Linear(d_model * 4, d_model)) + ])) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + + def attention(self, x: torch.Tensor): + self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None + return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] + + def forward(self, x: torch.Tensor): + x = x + self.attention(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x + + +class Transformer(nn.Module): + def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)]) + + def forward(self, x: torch.Tensor): + return self.resblocks(x) + + +class VisualTransformer(nn.Module): + def __init__(self, 
input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int): + super().__init__() + self.input_resolution = input_resolution + self.output_dim = output_dim + self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False) + + scale = width ** -0.5 + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width)) + self.ln_pre = LayerNorm(width) + + self.transformer = Transformer(width, layers, heads) + + self.ln_post = LayerNorm(width) + self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) + + def forward(self, x: torch.Tensor): + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width] + x = x + self.positional_embedding.to(x.dtype) + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + + x = self.ln_post(x[:, 0, :]) + + if self.proj is not None: + x = x @ self.proj + + return x + + +class CLIP(nn.Module): + def __init__(self, + embed_dim: int, + # vision + image_resolution: int, + vision_layers: Union[Tuple[int, int, int, int], int], + vision_width: int, + vision_patch_size: int, + # text + context_length: int, + vocab_size: int, + transformer_width: int, + transformer_heads: int, + transformer_layers: int + ): + super().__init__() + + self.context_length = context_length + + if isinstance(vision_layers, (tuple, list)): + vision_heads = vision_width * 32 // 64 + self.visual = ModifiedResNet( + layers=vision_layers, + output_dim=embed_dim, + heads=vision_heads, + input_resolution=image_resolution, + width=vision_width + ) + else: + vision_heads = vision_width // 64 + self.visual = VisualTransformer( + input_resolution=image_resolution, + patch_size=vision_patch_size, + width=vision_width, + layers=vision_layers, + heads=vision_heads, + output_dim=embed_dim + ) + + self.transformer = Transformer( + width=transformer_width, + layers=transformer_layers, + heads=transformer_heads, + attn_mask=self.build_attention_mask() + ) + + self.vocab_size = vocab_size + self.token_embedding = nn.Embedding(vocab_size, transformer_width) + self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width)) + self.ln_final = LayerNorm(transformer_width) + + self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim)) + self.logit_scale = nn.Parameter(torch.ones([])) + + self.initialize_parameters() + + def initialize_parameters(self): + nn.init.normal_(self.token_embedding.weight, std=0.02) + nn.init.normal_(self.positional_embedding, std=0.01) + + if isinstance(self.visual, ModifiedResNet): + if self.visual.attnpool is not None: + std = self.visual.attnpool.c_proj.in_features ** -0.5 + nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std) + + for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]: + for name, param in resnet_block.named_parameters(): + if 
name.endswith("bn3.weight"): + nn.init.zeros_(param) + + proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5) + attn_std = self.transformer.width ** -0.5 + fc_std = (2 * self.transformer.width) ** -0.5 + for block in self.transformer.resblocks: + nn.init.normal_(block.attn.in_proj_weight, std=attn_std) + nn.init.normal_(block.attn.out_proj.weight, std=proj_std) + nn.init.normal_(block.mlp.c_fc.weight, std=fc_std) + nn.init.normal_(block.mlp.c_proj.weight, std=proj_std) + + if self.text_projection is not None: + nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5) + + def build_attention_mask(self): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = torch.empty(self.context_length, self.context_length) + mask.fill_(float("-inf")) + mask.triu_(1) # zero out the lower diagonal + return mask + + @property + def dtype(self): + return self.visual.conv1.weight.dtype + + def encode_image(self, image): + return self.visual(image.type(self.dtype)) + + def encode_text(self, text): + x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model] + + x = x + self.positional_embedding.type(self.dtype) + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + x = self.ln_final(x).type(self.dtype) + + # x.shape = [batch_size, n_ctx, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection + + return x + + def forward(self, image, text): + image_features = self.encode_image(image) + text_features = self.encode_text(text) + + # normalized features + image_features = image_features / image_features.norm(dim=-1, keepdim=True) + text_features = text_features / text_features.norm(dim=-1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_image = logit_scale * image_features @ text_features.t() + logits_per_text = logit_scale * text_features @ image_features.t() + + # shape = [global_batch_size, global_batch_size] + return logits_per_image, logits_per_text + + +def convert_weights(model: nn.Module): + """Convert applicable model parameters to fp16""" + + def _convert_weights_to_fp16(l): + if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)): + l.weight.data = l.weight.data.half() + if l.bias is not None: + l.bias.data = l.bias.data.half() + + if isinstance(l, nn.MultiheadAttention): + for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]: + tensor = getattr(l, attr) + if tensor is not None: + tensor.data = tensor.data.half() + + for name in ["text_projection", "proj"]: + if hasattr(l, name): + attr = getattr(l, name) + if attr is not None: + attr.data = attr.data.half() + + model.apply(_convert_weights_to_fp16) + + +def build_model(state_dict: dict): + vit = "visual.proj" in state_dict + + if vit: + vision_width = state_dict["visual.conv1.weight"].shape[0] + vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")]) + vision_patch_size = state_dict["visual.conv1.weight"].shape[-1] + grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5) + image_resolution = vision_patch_size * grid_size + else: + counts: list = [len(set(k.split(".")[2] for k in state_dict if 
k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]] + vision_layers = tuple(counts) + vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0] + output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5) + vision_patch_size = None + assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0] + image_resolution = output_width * 32 + + embed_dim = state_dict["text_projection"].shape[1] + context_length = state_dict["positional_embedding"].shape[0] + vocab_size = state_dict["token_embedding.weight"].shape[0] + transformer_width = state_dict["ln_final.weight"].shape[0] + transformer_heads = transformer_width // 64 + transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith(f"transformer.resblocks"))) + + model = CLIP( + embed_dim, + image_resolution, vision_layers, vision_width, vision_patch_size, + context_length, vocab_size, transformer_width, transformer_heads, transformer_layers + ) + + for key in ["input_resolution", "context_length", "vocab_size"]: + if key in state_dict: + del state_dict[key] + + convert_weights(model) + model.load_state_dict(state_dict) + return model.eval() diff --git a/model/esresnet/__init__.py b/model/esresnet/__init__.py new file mode 100644 index 0000000..684cdc6 --- /dev/null +++ b/model/esresnet/__init__.py @@ -0,0 +1,8 @@ +from .base import ESResNet +from .base import ESResNeXt +from .fbsp import ESResNetFBSP +from .fbsp import ESResNeXtFBSP +from .attention import Attention2d + + +__all__ = ['ESResNet', 'ESResNeXt', 'ESResNetFBSP', 'ESResNeXtFBSP', 'Attention2d'] diff --git a/model/esresnet/attention.py b/model/esresnet/attention.py new file mode 100644 index 0000000..fb3c0c3 --- /dev/null +++ b/model/esresnet/attention.py @@ -0,0 +1,40 @@ +import torch +import torch.nn.functional as F + +from typing import Tuple + + +class Attention2d(torch.nn.Module): + + def __init__(self, + in_channels: int, + out_channels: int, + num_kernels: int, + kernel_size: Tuple[int, int], + padding_size: Tuple[int, int]): + + super(Attention2d, self).__init__() + + self.conv_depth = torch.nn.Conv2d( + in_channels=in_channels, + out_channels=in_channels * num_kernels, + kernel_size=kernel_size, + padding=padding_size, + groups=in_channels + ) + self.conv_point = torch.nn.Conv2d( + in_channels=in_channels * num_kernels, + out_channels=out_channels, + kernel_size=(1, 1) + ) + self.bn = torch.nn.BatchNorm2d(num_features=out_channels) + self.activation = torch.nn.Sigmoid() + + def forward(self, x: torch.Tensor, size: torch.Size) -> torch.Tensor: + x = F.adaptive_max_pool2d(x, size) + x = self.conv_depth(x) + x = self.conv_point(x) + x = self.bn(x) + x = self.activation(x) + + return x diff --git a/model/esresnet/base.py b/model/esresnet/base.py new file mode 100644 index 0000000..a2a0acc --- /dev/null +++ b/model/esresnet/base.py @@ -0,0 +1,708 @@ +import termcolor + +import numpy as np +import scipy.signal as sps + +import torch +import torch.nn.functional as F + +import torchvision as tv + +import ignite_trainer as it + +from model.esresnet import attention +from utils.transforms import scale + +from typing import cast +from typing import List +from typing import Type +from typing import Tuple +from typing import Union +from typing import Optional + + +def conv3x3(in_planes: int, out_planes: int, stride=1, groups: int = 1, dilation: Union[int, Tuple[int, int]] = 1): + """ + CREDITS: https://github.com/pytorch/vision + 3x3 convolution with padding + """ + return torch.nn.Conv2d( 
+ in_channels=in_planes, + out_channels=out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias=False, + dilation=dilation + ) + + +def conv1x1(in_planes: int, out_planes: int, stride: Union[int, Tuple[int, int]] = 1): + """ + CREDITS: https://github.com/pytorch/vision + 1x1 convolution + """ + return torch.nn.Conv2d( + in_channels=in_planes, + out_channels=out_planes, + kernel_size=1, + stride=stride, + bias=False + ) + + +class BasicBlock(torch.nn.Module): + + """ + CREDITS: https://github.com/pytorch/vision + """ + + expansion: int = 1 + + def __init__(self, + inplanes: int, + planes: int, + stride: Union[int, Tuple[int, int]] = 1, + downsample: Optional[torch.nn.Module] = None, + groups: int = 1, + base_width: int = 64, + dilation: Union[int, Tuple[int, int]] = 1, + norm_layer: Optional[Type[torch.nn.Module]] = None): + + super(BasicBlock, self).__init__() + + if norm_layer is None: + norm_layer = torch.nn.BatchNorm2d + if groups != 1 or base_width != 64: + raise ValueError('BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError("Dilation > 1 not supported in BasicBlock") + + # Both self.conv1 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = torch.nn.ReLU() + self.conv2 = conv3x3(planes, planes) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x: torch.Tensor) -> torch.Tensor: + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class Bottleneck(torch.nn.Module): + + """ + CREDITS: https://github.com/pytorch/vision + """ + + expansion: int = 4 + + def __init__(self, + inplanes: int, + planes: int, + stride: Union[int, Tuple[int, int]] = 1, + downsample: Optional[torch.nn.Module] = None, + groups: int = 1, + base_width: int = 64, + dilation: Union[int, Tuple[int, int]] = 1, + norm_layer: Optional[Type[torch.nn.Module]] = None): + + super(Bottleneck, self).__init__() + + if norm_layer is None: + norm_layer = torch.nn.BatchNorm2d + + width = int(planes * (base_width / 64.0)) * groups + + self.conv1 = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.conv2 = conv3x3(width, width, stride, groups, dilation) + self.bn2 = norm_layer(width) + self.conv3 = conv1x1(width, planes * self.expansion) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = torch.nn.ReLU() + self.downsample = downsample + self.stride = stride + + def forward(self, x: torch.Tensor) -> torch.Tensor: + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class ResNetWithAttention(it.AbstractNet): + + """ + CREDITS: https://github.com/pytorch/vision + """ + + def __init__(self, + block: Type[Union[BasicBlock, Bottleneck]], + layers: List[int], + apply_attention: bool = False, + num_channels: int = 3, + num_classes: int = 1000, + zero_init_residual: bool = False, + groups: int = 1, + width_per_group: int = 64, + replace_stride_with_dilation: bool = None, + norm_layer: 
Optional[Type[torch.nn.Module]] = None): + + super(ResNetWithAttention, self).__init__() + + self.apply_attention = apply_attention + + if norm_layer is None: + norm_layer = torch.nn.BatchNorm2d + + self._norm_layer = norm_layer + + self.inplanes = 64 + self.dilation = 1 + + if replace_stride_with_dilation is None: + replace_stride_with_dilation = [False, False, False] + + if len(replace_stride_with_dilation) != 3: + raise ValueError( + f'replace_stride_with_dilation should be None or a 3-element tuple, got {replace_stride_with_dilation}' + ) + + self.groups = groups + self.base_width = width_per_group + + self.conv1 = torch.nn.Conv2d(num_channels, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = norm_layer(self.inplanes) + self.relu = torch.nn.ReLU() + self.maxpool = torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.layer1 = self._make_layer(block, 64, layers[0]) + if self.apply_attention: + self.att1 = attention.Attention2d( + in_channels=64, + out_channels=64 * block.expansion, + num_kernels=1, + kernel_size=(3, 1), + padding_size=(1, 0) + ) + + self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0]) + if self.apply_attention: + self.att2 = attention.Attention2d( + in_channels=64 * block.expansion, + out_channels=128 * block.expansion, + num_kernels=1, + kernel_size=(1, 5), + padding_size=(0, 2) + ) + + self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1]) + if self.apply_attention: + self.att3 = attention.Attention2d( + in_channels=128 * block.expansion, + out_channels=256 * block.expansion, + num_kernels=1, + kernel_size=(3, 1), + padding_size=(1, 0) + ) + + self.layer4 = self._make_layer(block, 512, layers[3], stride=2, dilate=replace_stride_with_dilation[2]) + if self.apply_attention: + self.att4 = attention.Attention2d( + in_channels=256 * block.expansion, + out_channels=512 * block.expansion, + num_kernels=1, + kernel_size=(1, 5), + padding_size=(0, 2) + ) + + self.avgpool = torch.nn.AdaptiveAvgPool2d((1, 1)) + if self.apply_attention: + self.att5 = attention.Attention2d( + in_channels=512 * block.expansion, + out_channels=512 * block.expansion, + num_kernels=1, + kernel_size=(3, 5), + padding_size=(1, 2) + ) + + self.fc = torch.nn.Linear(512 * block.expansion, num_classes) + + for m in self.modules(): + if isinstance(m, torch.nn.Conv2d): + torch.nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (torch.nn.BatchNorm2d, torch.nn.GroupNorm)): + torch.nn.init.constant_(m.weight, 1) + torch.nn.init.constant_(m.bias, 0) + + if zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + torch.nn.init.constant_(m.bn3.weight, 0) + elif isinstance(m, BasicBlock): + torch.nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, + block: Type[Union[BasicBlock, Bottleneck]], + planes: int, + blocks: int, + stride: Union[int, Tuple[int, int]] = 1, + dilate: bool = False) -> torch.nn.Module: + + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + + if dilate: + self.dilation *= stride + stride = 1 + + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = torch.nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion) + ) + + layers = list() + layers.append(block( + self.inplanes, + planes, + stride, + downsample, + self.groups, + self.base_width, + previous_dilation, + norm_layer + 
)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block( + self.inplanes, + planes, + groups=self.groups, + base_width=self.base_width, + dilation=self.dilation, + norm_layer=norm_layer + )) + + return torch.nn.Sequential(*layers) + + def _forward_pre_processing(self, x: torch.Tensor) -> torch.Tensor: + x = x.to(torch.get_default_dtype()) + + return x + + def _forward_pre_features(self, x: torch.Tensor) -> torch.Tensor: + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + return x + + def _forward_features(self, x: torch.Tensor) -> torch.Tensor: + x = self._forward_pre_features(x) + + if self.apply_attention: + x_att = x.clone() + x = self.layer1(x) + x_att = self.att1(x_att, x.shape[-2:]) + x = x * x_att + + x_att = x.clone() + x = self.layer2(x) + x_att = self.att2(x_att, x.shape[-2:]) + x = x * x_att + + x_att = x.clone() + x = self.layer3(x) + x_att = self.att3(x_att, x.shape[-2:]) + x = x * x_att + + x_att = x.clone() + x = self.layer4(x) + x_att = self.att4(x_att, x.shape[-2:]) + x = x * x_att + else: + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + return x + + def _forward_reduction(self, x: torch.Tensor) -> torch.Tensor: + if self.apply_attention: + x_att = x.clone() + x = self.avgpool(x) + x_att = self.att5(x_att, x.shape[-2:]) + x = x * x_att + else: + x = self.avgpool(x) + + x = torch.flatten(x, 1) + + return x + + def _forward_classifier(self, x: torch.Tensor) -> torch.Tensor: + x = self.fc(x) + + return x + + def forward(self, + x: torch.Tensor, + y: Optional[torch.Tensor] = None) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + + x = self._forward_pre_processing(x) + x = self._forward_features(x) + x = self._forward_reduction(x) + y_pred = self._forward_classifier(x) + + loss = None + if y is not None: + loss = self.loss_fn(y_pred, y).mean() + + return y_pred if loss is None else (y_pred, loss) + + def loss_fn(self, y_pred: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + if isinstance(y_pred, tuple): + y_pred, *_ = y_pred + + if y_pred.shape == y.shape: + loss_pred = F.binary_cross_entropy_with_logits( + y_pred, + y.to(dtype=y_pred.dtype, device=y_pred.device), + reduction='sum' + ) / y_pred.shape[0] + else: + loss_pred = F.cross_entropy(y_pred, y.to(y_pred.device)) + + return loss_pred + + @property + def loss_fn_name(self) -> str: + return 'Cross Entropy' + + +class _ESResNet(ResNetWithAttention): + + @staticmethod + def loading_function(*args, **kwargs) -> torch.nn.Module: + raise NotImplementedError + + def __init__(self, + block: Type[Union[BasicBlock, Bottleneck]], + layers: List[int], + apply_attention: bool = False, + n_fft: int = 256, + hop_length: Optional[int] = None, + win_length: Optional[int] = None, + window: Optional[str] = None, + normalized: bool = False, + onesided: bool = True, + spec_height: int = 224, + spec_width: int = 224, + num_classes: int = 1000, + pretrained: Union[bool, str] = False, + lock_pretrained: Optional[Union[bool, List[str]]] = None, + zero_init_residual: bool = False, + groups: int = 1, + width_per_group: int = 64, + replace_stride_with_dilation: bool = None, + norm_layer: Optional[Type[torch.nn.Module]] = None): + + super(_ESResNet, self).__init__( + block=block, + layers=layers, + apply_attention=apply_attention, + num_channels=3, + num_classes=num_classes, + zero_init_residual=zero_init_residual, + groups=groups, + width_per_group=width_per_group, + replace_stride_with_dilation=replace_stride_with_dilation, + 
norm_layer=norm_layer + ) + + self.num_classes = num_classes + + self.fc = torch.nn.Linear( + in_features=self.fc.in_features, + out_features=self.num_classes, + bias=self.fc.bias is not None + ) + + if hop_length is None: + hop_length = int(np.floor(n_fft / 4)) + + if win_length is None: + win_length = n_fft + + if window is None: + window = 'boxcar' + + self.n_fft = n_fft + self.win_length = win_length + self.hop_length = hop_length + + self.normalized = normalized + self.onesided = onesided + + self.spec_height = spec_height + self.spec_width = spec_width + + self.pretrained = pretrained + self._inject_members() + if pretrained: + err_msg = self.load_pretrained() + + unlocked_weights = list() + + for name, p in self.named_parameters(): + unlock = True + if isinstance(lock_pretrained, bool): + if lock_pretrained and name not in err_msg: + unlock = False + elif isinstance(lock_pretrained, list): + if name in lock_pretrained: + unlock = False + + p.requires_grad_(unlock) + if unlock: + unlocked_weights.append(name) + + print(f'Following weights are unlocked: {unlocked_weights}') + + window_buffer: torch.Tensor = torch.from_numpy( + sps.get_window(window=window, Nx=win_length, fftbins=True) + ).to(torch.get_default_dtype()) + self.register_buffer('window', window_buffer) + + self.log10_eps = 1e-18 + + if self.apply_attention and pretrained and not isinstance(pretrained, str): + self._reset_attention() + + def _inject_members(self): + pass + + def _reset_attention(self): + print(termcolor.colored('Resetting attention blocks', 'green')) + + self.att1.bn.weight.data.fill_(1.0) + self.att1.bn.bias.data.fill_(1.0) + + self.att2.bn.weight.data.fill_(1.0) + self.att2.bn.bias.data.fill_(1.0) + + self.att3.bn.weight.data.fill_(1.0) + self.att3.bn.bias.data.fill_(1.0) + + self.att4.bn.weight.data.fill_(1.0) + self.att4.bn.bias.data.fill_(1.0) + + self.att5.bn.weight.data.fill_(1.0) + self.att5.bn.bias.data.fill_(1.0) + + def load_pretrained(self) -> str: + if isinstance(self.pretrained, bool): + state_dict = self.loading_func(pretrained=True).state_dict() + else: + state_dict = torch.load(self.pretrained, map_location='cpu') + + err_msg = '' + try: + self.load_state_dict(state_dict=state_dict, strict=True) + except RuntimeError as ex: + err_msg += f'While loading some errors occurred.\n{ex}' + print(termcolor.colored(err_msg, 'red')) + + return err_msg + + def spectrogram(self, x: torch.Tensor) -> torch.Tensor: + spec = torch.stft( + x.view(-1, x.shape[-1]), + n_fft=self.n_fft, + hop_length=self.hop_length, + win_length=self.win_length, + window=self.window, + pad_mode='reflect', + normalized=self.normalized, + onesided=True + ) + + if not self.onesided: + spec = torch.cat((torch.flip(spec, dims=(-3,)), spec), dim=-3) + + return spec + + def split_spectrogram(self, spec: torch.Tensor, batch_size: int) -> torch.Tensor: + spec_height_per_band = spec.shape[-3] // self.conv1.in_channels + spec_height_single_band = self.conv1.in_channels * spec_height_per_band + spec = spec[:, :spec_height_single_band] + + spec = spec.reshape(batch_size, -1, spec.shape[-3] // self.conv1.in_channels, *spec.shape[-2:]) + + return spec + + def spectrogram_to_power(self, spec: torch.Tensor) -> torch.Tensor: + spec_height = spec.shape[-3] if self.spec_height < 1 else self.spec_height + spec_width = spec.shape[-2] if self.spec_width < 1 else self.spec_width + + pow_spec = spec[..., 0] ** 2 + spec[..., 1] ** 2 + + if spec_height != pow_spec.shape[-2] or spec_width != pow_spec.shape[-1]: + pow_spec = F.interpolate( + pow_spec, + 
size=(spec_height, spec_width), + mode='bilinear', + align_corners=True + ) + + return pow_spec + + def _forward_pre_processing(self, x: torch.Tensor) -> torch.Tensor: + x = super(_ESResNet, self)._forward_pre_processing(x) + x = scale(x, -32768.0, 32767, -1.0, 1.0) + + spec = self.spectrogram(x) + spec_split_ch = self.split_spectrogram(spec, x.shape[0]) + pow_spec_split_ch = self.spectrogram_to_power(spec_split_ch) + pow_spec_split_ch = torch.where( + cast(torch.Tensor, pow_spec_split_ch > 0.0), + pow_spec_split_ch, + torch.full_like(pow_spec_split_ch, self.log10_eps) + ) + pow_spec_split_ch = pow_spec_split_ch.reshape( + x.shape[0], -1, self.conv1.in_channels, *pow_spec_split_ch.shape[-2:] + ) + x_db = torch.log10(pow_spec_split_ch).mul(10.0) + + return x_db + + def _forward_features(self, x_db: torch.Tensor) -> List[torch.Tensor]: + outputs = list() + for ch_idx in range(x_db.shape[1]): + ch = x_db[:, ch_idx] + out = super(_ESResNet, self)._forward_features(ch) + outputs.append(out) + + return outputs + + def _forward_reduction(self, x: List[torch.Tensor]) -> torch.Tensor: + outputs = list() + for ch in x: + out = super(_ESResNet, self)._forward_reduction(ch) + outputs.append(out) + outputs = torch.stack(outputs, dim=-1).sum(dim=-1) + + return outputs + + +class ESResNet(_ESResNet): + + loading_func = staticmethod(tv.models.resnet50) + + def __init__(self, + n_fft: int = 256, + hop_length: Optional[int] = None, + win_length: Optional[int] = None, + window: Optional[str] = None, + normalized: bool = False, + onesided: bool = True, + spec_height: int = 224, + spec_width: int = 224, + num_classes: int = 1000, + apply_attention: bool = False, + pretrained: bool = False, + lock_pretrained: Optional[Union[bool, List[str]]] = None): + + super(ESResNet, self).__init__( + block=Bottleneck, + layers=[3, 4, 6, 3], + apply_attention=apply_attention, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + normalized=normalized, + onesided=onesided, + spec_height=spec_height, + spec_width=spec_width, + num_classes=num_classes, + pretrained=pretrained, + lock_pretrained=lock_pretrained + ) + + +class ESResNeXt(_ESResNet): + + loading_func = staticmethod(tv.models.resnext50_32x4d) + + def __init__(self, + n_fft: int = 256, + hop_length: Optional[int] = None, + win_length: Optional[int] = None, + window: Optional[str] = None, + normalized: bool = False, + onesided: bool = True, + spec_height: int = 224, + spec_width: int = 224, + num_classes: int = 1000, + apply_attention: bool = False, + pretrained: Union[bool, str] = False, + lock_pretrained: Optional[Union[bool, List[str]]] = None): + + super(ESResNeXt, self).__init__( + block=Bottleneck, + layers=[3, 4, 6, 3], + apply_attention=apply_attention, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + normalized=normalized, + onesided=onesided, + spec_height=spec_height, + spec_width=spec_width, + num_classes=num_classes, + pretrained=pretrained, + lock_pretrained=lock_pretrained, + groups=32, + width_per_group=4 + ) diff --git a/model/esresnet/fbsp.py b/model/esresnet/fbsp.py new file mode 100644 index 0000000..15d329d --- /dev/null +++ b/model/esresnet/fbsp.py @@ -0,0 +1,247 @@ +import numpy as np + +import torch +import torch.nn.functional as F + +import torchvision as tv + +from utils import transforms +from model.esresnet.base import _ESResNet +from model.esresnet.base import Bottleneck + +from typing import cast +from typing import List +from typing import Tuple +from typing import Union 
+from typing import Optional + + +class LinearFBSP(torch.nn.Module): + + def __init__(self, out_features: int, bias: bool = True, normalized: bool = False): + super(LinearFBSP, self).__init__() + + self.out_features = out_features + self.normalized = normalized + self.eps = 1e-8 + + default_dtype = torch.get_default_dtype() + + self.register_parameter('m', torch.nn.Parameter(torch.zeros(self.out_features, dtype=default_dtype))) + self.register_parameter('fb', torch.nn.Parameter(torch.ones(self.out_features, dtype=default_dtype))) + self.register_parameter('fc', torch.nn.Parameter(torch.arange(self.out_features, dtype=default_dtype))) + self.register_parameter( + 'bias', + torch.nn.Parameter( + torch.normal( + 0.0, 0.5, (self.out_features, 2), dtype=default_dtype + ) if bias else cast( + torch.nn.Parameter, None + ) + ) + ) + + self.m.register_hook(lambda grad: grad / (torch.norm(grad, p=float('inf')) + self.eps)) + self.fb.register_hook(lambda grad: grad / (torch.norm(grad, p=float('inf')) + self.eps)) + self.fc.register_hook(lambda grad: grad / (torch.norm(grad, p=float('inf')) + self.eps)) + + @staticmethod + def power(x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: + magnitudes = (x1[..., 0] ** 2 + x1[..., 1] ** 2) ** 0.5 + phases = x1[..., 1].atan2(x1[..., 0]) + + power_real = x2[..., 0] + power_imag = x2[..., 1] + + mag_out = ((magnitudes ** 2) ** (0.5 * power_real) * torch.exp(-power_imag * phases)) + + return mag_out.unsqueeze(-1) * torch.stack(( + (power_real * phases + 0.5 * power_imag * (magnitudes ** 2).log()).cos(), + (power_real * phases + 0.5 * power_imag * (magnitudes ** 2).log()).sin() + ), dim=-1) + + @staticmethod + def sinc(x: torch.Tensor) -> torch.Tensor: + return torch.where(cast(torch.Tensor, x == 0), torch.ones_like(x), torch.sin(x) / x) + + def _materialize_weights(self, x: torch.Tensor) -> Tuple[torch.Tensor, bool]: + x_is_complex = x.shape[-1] == 2 + in_features = x.shape[-1 - int(x_is_complex)] + + t = np.pi * torch.linspace(-1.0, 1.0, in_features, dtype=x.dtype, device=x.device).reshape(1, -1, 1) + self.eps + + m = self.m.reshape(-1, 1, 1) + fb = self.fb.reshape(-1, 1, 1) + fc = self.fc.reshape(-1, 1, 1) + + kernel = torch.cat((torch.cos(fc * t), -torch.sin(fc * t)), dim=-1) # complex + scale = fb.sqrt() # real + win = self.sinc(fb * t / (m + self.eps)) # real + win = self.power( + torch.cat((win, torch.zeros_like(win)), dim=-1), + torch.cat((m, torch.zeros_like(m)), dim=-1) + ) # complex + + weights = scale * torch.cat(( + win[..., :1] * kernel[..., :1] - win[..., 1:] * kernel[..., 1:], + win[..., :1] * kernel[..., 1:] + win[..., 1:] * kernel[..., :1] + ), dim=-1) + + if self.normalized: + weights = weights / (in_features ** 0.5) + + return weights, x_is_complex + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + weights, x_is_complex = self._materialize_weights(x) + + if x_is_complex: + x = torch.stack(( + F.linear(x[..., 0], weights[..., 0]) - F.linear(x[..., 1], weights[..., 1]), + F.linear(x[..., 0], weights[..., 1]) + F.linear(x[..., 1], weights[..., 0]) + ), dim=-1) + else: + x = torch.stack(( + F.linear(x, weights[..., 0]), + F.linear(x, weights[..., 1]) + ), dim=-1) + + if (self.bias is not None) and (self.bias.numel() == (self.out_features * 2)): + x = x + self.bias + + return x, weights + + def extra_repr(self) -> str: + return 'out_features={}, bias={}, normalized={}'.format( + self.out_features, + (self.bias is not None) and (self.bias.numel() == (self.out_features * 2)), + self.normalized + ) + + +ttf_weights = 
dict() + + +class _ESResNetFBSP(_ESResNet): + + def _inject_members(self): + self.add_module( + 'fbsp', + LinearFBSP( + out_features=int(round(self.n_fft / 2)) + 1 if self.onesided else self.n_fft, + normalized=self.normalized, + bias=False + ) + ) + + def spectrogram(self, x: torch.Tensor) -> torch.Tensor: + with torch.no_grad(): + frames = transforms.frame_signal( + signal=x.view(-1, x.shape[-1]), + frame_length=self.win_length, + hop_length=self.hop_length, + window=self.window + ) + + if self.n_fft > self.win_length: + pad_length = self.n_fft - self.win_length + pad_left = pad_length // 2 + pad_right = pad_length - pad_left + frames = F.pad(frames, [pad_left, pad_right]) + + spec, ttf_weights_ = self.fbsp(frames) + + spec = spec.transpose(-2, -3) + ttf_weights[x.device] = ttf_weights_ + + return spec + + def loss_ttf(self, device: torch.device) -> torch.Tensor: + ttf_norm = torch.norm(ttf_weights[device], p=2, dim=[-1, -2]) + loss_ttf_norm = F.mse_loss( + ttf_norm, + torch.full_like(ttf_norm, 1.0 if self.normalized else self.n_fft ** 0.5) + ) + + return loss_ttf_norm + + def loss_fn(self, y_pred: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + loss_pred = super(_ESResNetFBSP, self).loss_fn(y_pred, y) + loss_ttf_norm = self.loss_ttf(y_pred.device) + loss = loss_pred + loss_ttf_norm + + return loss + + +class ESResNetFBSP(_ESResNetFBSP): + + loading_func = staticmethod(tv.models.resnet50) + + def __init__(self, + n_fft: int = 256, + hop_length: Optional[int] = None, + win_length: Optional[int] = None, + window: Optional[str] = None, + normalized: bool = False, + onesided: bool = True, + spec_height: int = 224, + spec_width: int = 224, + num_classes: int = 1000, + apply_attention: bool = False, + pretrained: bool = False, + lock_pretrained: Optional[Union[bool, List[str]]] = None): + + super(ESResNetFBSP, self).__init__( + block=Bottleneck, + layers=[3, 4, 6, 3], + apply_attention=apply_attention, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + normalized=normalized, + onesided=onesided, + spec_height=spec_height, + spec_width=spec_width, + num_classes=num_classes, + pretrained=pretrained, + lock_pretrained=lock_pretrained + ) + + +class ESResNeXtFBSP(_ESResNetFBSP): + + loading_func = staticmethod(tv.models.resnext50_32x4d) + + def __init__(self, + n_fft: int = 256, + hop_length: Optional[int] = None, + win_length: Optional[int] = None, + window: Optional[str] = None, + normalized: bool = False, + onesided: bool = True, + spec_height: int = 224, + spec_width: int = 224, + num_classes: int = 1000, + apply_attention: bool = False, + pretrained: Union[bool, str] = False, + lock_pretrained: Optional[Union[bool, List[str]]] = None): + + super(ESResNeXtFBSP, self).__init__( + block=Bottleneck, + layers=[3, 4, 6, 3], + apply_attention=apply_attention, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + normalized=normalized, + onesided=onesided, + spec_height=spec_height, + spec_width=spec_width, + num_classes=num_classes, + pretrained=pretrained, + lock_pretrained=lock_pretrained, + groups=32, + width_per_group=4 + ) diff --git a/protocols/README.md b/protocols/README.md new file mode 100644 index 0000000..48e72c8 --- /dev/null +++ b/protocols/README.md @@ -0,0 +1,3 @@ +# Protocols + +Here are the JSON-files that describe configurations of experiments. 
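+
+Each protocol shares the same top-level layout as `audioclip-esc50.json` and `audioclip-us8k.json`: `Visdom`, `Setup`, `Model`, `Optimizer`, `Scheduler`, `Dataset`, `Transforms` and `Metrics`, where every component is declared as an importable class plus its constructor arguments.
+
+As a rough illustration (this snippet is not part of the training code and assumes the repository root as the working directory), a protocol can be inspected with the standard library only:
+
+```
+import json
+
+# load one of the experiment configurations shipped in this directory
+with open('protocols/audioclip-esc50.json') as fp:
+    protocol = json.load(fp)
+
+# top-level sections of the experiment description
+print(list(protocol.keys()))  # ['Visdom', 'Setup', 'Model', 'Optimizer', 'Scheduler', 'Dataset', 'Transforms', 'Metrics']
+
+# components are given as an importable class plus constructor arguments
+print(protocol['Dataset']['class'])         # utils.datasets.ESC50
+print(protocol['Dataset']['args']['root'])  # /path/to/ESC50
+```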
diff --git a/protocols/audioclip-esc50.json b/protocols/audioclip-esc50.json new file mode 100644 index 0000000..32e1ada --- /dev/null +++ b/protocols/audioclip-esc50.json @@ -0,0 +1,108 @@ +{ + "Visdom": { + "host": null, + "port": null, + "env_path": null + }, + "Setup": { + "name": "Multimodal-Audio", + "suffix": "CV1", + "batch_train": 64, + "batch_test": 64, + "workers_train": 4, + "workers_test": 4, + "epochs": 50, + "log_interval": 10, + "saved_models_path": "/path/to/saved/models" + }, + "Model": { + "class": "model.audioclip.AudioCLIP", + "args": { + "multilabel": false, + "pretrained": "/path/to/assets/trained_AudioCLIP.pt" + } + }, + "Optimizer": { + "class": "torch.optim.SGD", + "args": { + "lr": 5e-5, + "momentum": 0.9, + "nesterov": true, + "weight_decay": 5e-4 + } + }, + "Scheduler": { + "class": "torch.optim.lr_scheduler.ExponentialLR", + "args": { + "gamma": 0.96 + } + }, + "Dataset": { + "class": "utils.datasets.ESC50", + "args": { + "dl_shuffle": true, + "root": "/path/to/ESC50", + "sample_rate": 44100, + "fold": 1, + "training": {"key": "train", "yes": true, "no": false} + } + }, + "Transforms": [ + { + "class": "utils.transforms.ToTensor1D", + "args": {} + }, + { + "class": "utils.transforms.RandomFlip", + "args": {"p": 0.5}, + "test": false + }, + { + "class": "utils.transforms.RandomScale", + "args": {"max_scale": 1.50}, + "test": false + }, + { + "class": "utils.transforms.RandomPadding", + "args": {"out_len": 220500}, + "test": false + }, + { + "class": "utils.transforms.RandomCrop", + "args": {"out_len": 220500}, + "test": false + }, + { + "class": "utils.transforms.RandomNoise", + "args": {"snr_min_db": 10.0, "snr_max_db": 120.0, "p": 0.25}, + "test": false + }, + { + "class": "utils.transforms.RandomPadding", + "args": {"out_len": 220500, "train": false}, + "train": false + }, + { + "class": "utils.transforms.RandomCrop", + "args": {"out_len": 220500, "train": false}, + "train": false + } + ], + "Metrics": { + "Performance": { + "window_name": null, + "x_label": "#Epochs", + "y_label": "Accuracy", + "width": 1890, + "height": 416, + "lines": [ + { + "line_label": "Val. 
Acc.", + "class": "ignite.metrics.Accuracy", + "args": {}, + "is_checkpoint": true + } + ] + } + } +} diff --git a/protocols/audioclip-us8k.json b/protocols/audioclip-us8k.json new file mode 100644 index 0000000..4c1f649 --- /dev/null +++ b/protocols/audioclip-us8k.json @@ -0,0 +1,108 @@ +{ + "Visdom": { + "host": null, + "port": null, + "env_path": null + }, + "Setup": { + "name": "Multimodal-Audio", + "suffix": "CV01", + "batch_train": 64, + "batch_test": 64, + "workers_train": 4, + "workers_test": 4, + "epochs": 50, + "log_interval": 25, + "saved_models_path": "/path/to/saved/models" + }, + "Model": { + "class": "model.audioclip.AudioCLIP", + "args": { + "multilabel": false, + "pretrained": "/path/to/assets/trained_AudioCLIP.pt" + } + }, + "Optimizer": { + "class": "torch.optim.SGD", + "args": { + "lr": 1e-5, + "momentum": 0.9, + "nesterov": true, + "weight_decay": 5e-4 + } + }, + "Scheduler": { + "class": "torch.optim.lr_scheduler.ExponentialLR", + "args": { + "gamma": 0.96 + } + }, + "Dataset": { + "class": "utils.datasets.UrbanSound8K", + "args": { + "root": "/path/to/UrbanSound8K", + "sample_rate": 44100, + "fold": 1, + "mono": false, + "training": {"key": "train", "yes": true, "no": false} + } + }, + "Transforms": [ + { + "class": "utils.transforms.ToTensor1D", + "args": {} + }, + { + "class": "utils.transforms.RandomFlip", + "args": {"p": 0.5}, + "test": false + }, + { + "class": "utils.transforms.RandomScale", + "args": {"max_scale": 1.50}, + "test": false + }, + { + "class": "utils.transforms.RandomPadding", + "args": {"out_len": 176400}, + "test": false + }, + { + "class": "utils.transforms.RandomCrop", + "args": {"out_len": 176400}, + "test": false + }, + { + "class": "utils.transforms.RandomNoise", + "args": {"snr_min_db": 10.0, "snr_max_db": 120.0, "p": 0.25}, + "test": false + }, + { + "class": "utils.transforms.RandomPadding", + "args": {"out_len": 176400, "train": false}, + "train": false + }, + { + "class": "utils.transforms.RandomCrop", + "args": {"out_len": 176400, "train": false}, + "train": false + } + ], + "Metrics": { + "Performance": { + "window_name": null, + "x_label": "#Epochs", + "y_label": "Accuracy", + "width": 1890, + "height": 416, + "lines": [ + { + "line_label": "Val. Acc.", + "class": "ignite.metrics.Accuracy", + "args": {}, + "is_checkpoint": true + } + ] + } + } +} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f78635a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +librosa==0.7.2 +numpy==1.18.1 +pandas==1.0.3 +pytorch-ignite==0.3.0 +scikit-learn==0.22.1 +scipy==1.4.1 +termcolor==1.1.0 +torch==1.7.1 +torchvision==0.8.2 +tqdm==4.43.0 +visdom==0.1.8.9 diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..c5be7f9 --- /dev/null +++ b/utils/__init__.py @@ -0,0 +1,7 @@ +from . import datasets +from . 
import transforms + +__all__ = [ + 'datasets', + 'transforms' +] diff --git a/utils/datasets/__init__.py b/utils/datasets/__init__.py new file mode 100644 index 0000000..cba11e7 --- /dev/null +++ b/utils/datasets/__init__.py @@ -0,0 +1,4 @@ +from .esc50 import ESC50 +from .us8k import UrbanSound8K + +__all__ = ['ESC50', 'UrbanSound8K'] diff --git a/utils/datasets/esc50.py b/utils/datasets/esc50.py new file mode 100644 index 0000000..17b0111 --- /dev/null +++ b/utils/datasets/esc50.py @@ -0,0 +1,128 @@ +import os +import warnings +import multiprocessing as mp + +import tqdm +import librosa + +import numpy as np +import pandas as pd + +import torch.utils.data as td + +from typing import Any +from typing import Dict +from typing import List +from typing import Tuple +from typing import Union +from typing import Optional + + +class ESC50(td.Dataset): + + def __init__(self, + root: str, + sample_rate: int = 22050, + train: bool = True, + fold: Optional[int] = None, + transform_audio=None, + target_transform=None, + **_): + + super(ESC50, self).__init__() + + self.sample_rate = sample_rate + + meta = self.load_meta(os.path.join(root, 'meta', 'esc50.csv')) + + if fold is None: + fold = 5 + + self.folds_to_load = set(meta['fold']) + + if fold not in self.folds_to_load: + raise ValueError(f'fold {fold} does not exist') + + self.train = train + self.transform = transform_audio + + if self.train: + self.folds_to_load -= {fold} + else: + self.folds_to_load -= self.folds_to_load - {fold} + + self.data: Dict[Union[str, int], Dict[str, Any]] = dict() + self.load_data(meta, os.path.join(root, 'audio')) + self.indices = list(self.data.keys()) + + self.class_idx_to_label = dict() + for row in self.data.values(): + idx = row['target'] + label = row['category'] + self.class_idx_to_label[idx] = label + self.label_to_class_idx = {lb: idx for idx, lb in self.class_idx_to_label.items()} + + self.target_transform = target_transform + + @staticmethod + def load_meta(path_to_csv: str) -> pd.DataFrame: + meta = pd.read_csv(path_to_csv) + + return meta + + @staticmethod + def _load_worker(idx: int, filename: str, sample_rate: Optional[int] = None) -> Tuple[int, int, np.ndarray]: + wav, sample_rate = librosa.load(filename, sr=sample_rate, mono=True) + + if wav.ndim == 1: + wav = wav[:, np.newaxis] + + wav = wav.T * 32768.0 + + return idx, sample_rate, wav.astype(np.float32) + + def load_data(self, meta: pd.DataFrame, base_path: str): + items_to_load = dict() + + for idx, row in meta.iterrows(): + if row['fold'] in self.folds_to_load: + items_to_load[idx] = os.path.join(base_path, row['filename']), self.sample_rate + + items_to_load = [(idx, path, sample_rate) for idx, (path, sample_rate) in items_to_load.items()] + + num_processes = os.cpu_count() + warnings.filterwarnings('ignore') + with mp.Pool(processes=num_processes) as pool: + tqdm.tqdm.write(f'Loading {self.__class__.__name__} (train={self.train})') + for idx, sample_rate, wav in pool.starmap( + func=self._load_worker, + iterable=items_to_load, + chunksize=int(np.ceil(len(items_to_load) / num_processes)) or 1 + ): + row = meta.loc[idx] + + self.data[idx] = { + 'audio': wav, + 'sample_rate': sample_rate, + 'target': row['target'], + 'category': row['category'].replace('_', ' '), + 'fold': row['fold'], + 'esc10': row['esc10'] + } + + def __getitem__(self, index: int) -> Tuple[np.ndarray, Optional[np.ndarray], List[str]]: + if not (0 <= index < len(self)): + raise IndexError + + audio: np.ndarray = self.data[self.indices[index]]['audio'] + target: str = 
self.data[self.indices[index]]['category'] + + if self.transform is not None: + audio = self.transform(audio) + if self.target_transform is not None: + target = self.target_transform(target) + + return audio, None, [target] + + def __len__(self) -> int: + return len(self.indices) diff --git a/utils/datasets/us8k.py b/utils/datasets/us8k.py new file mode 100644 index 0000000..ff45324 --- /dev/null +++ b/utils/datasets/us8k.py @@ -0,0 +1,167 @@ +import os +import warnings +import multiprocessing as mp + +import tqdm +import librosa +import soundfile as sf + +import numpy as np +import pandas as pd + +import torch.utils.data as td + +import sklearn.model_selection as skms + +import utils.transforms as transforms + +from typing import Any +from typing import Dict +from typing import List +from typing import Tuple +from typing import Optional + + +class UrbanSound8K(td.Dataset): + + def __init__(self, + root: str, + sample_rate: int = 22050, + train: bool = True, + fold: Optional[int] = None, + mono: bool = False, + transform_audio=None, + target_transform=None, + **_): + + super(UrbanSound8K, self).__init__() + + self.root = root + self.sample_rate = sample_rate + self.train = train + self.random_split_seed = None + + if fold is None: + fold = 1 + + if not (1 <= fold <= 10): + raise ValueError(f'Expected fold in range [1, 10], got {fold}') + + self.fold = fold + self.folds_to_load = set(range(1, 11)) + + if self.fold not in self.folds_to_load: + raise ValueError(f'fold {fold} does not exist') + + if self.train: + # if in training mode, keep all but test fold + self.folds_to_load -= {self.fold} + else: + # if in evaluation mode, keep the test samples only + self.folds_to_load -= self.folds_to_load - {self.fold} + + self.mono = mono + + self.transform = transform_audio + self.target_transform = target_transform + + self.data: Dict[str, Dict[str, Any]] = dict() + self.indices = dict() + self.load_data() + + self.class_idx_to_label = dict() + for row in self.data.values(): + idx = row['target'] + label = row['category'] + self.class_idx_to_label[idx] = label + self.label_to_class_idx = {lb: idx for idx, lb in self.class_idx_to_label.items()} + + @staticmethod + def _load_worker(fn: str, path_to_file: str, sample_rate: int, mono: bool = False) -> Tuple[str, int, np.ndarray]: + wav, sample_rate_ = sf.read( + path_to_file, + dtype='float32', + always_2d=True + ) + + wav = librosa.resample(wav.T, sample_rate_, sample_rate) + + if wav.shape[0] == 1 and not mono: + wav = np.concatenate((wav, wav), axis=0) + + wav = wav[:, :sample_rate * 4] + wav = transforms.scale(wav, wav.min(), wav.max(), -32768.0, 32767.0) + + return fn, sample_rate, wav.astype(np.float32) + + def load_data(self): + # read metadata + meta = pd.read_csv( + os.path.join(self.root, 'metadata', 'UrbanSound8K.csv'), + sep=',', + index_col='slice_file_name' + ) + + for row_idx, (fn, row) in enumerate(meta.iterrows()): + path = os.path.join(self.root, 'audio', 'fold{}'.format(row['fold']), fn) + self.data[fn] = path, self.sample_rate, self.mono + + # by default, the official split from the metadata is used + files_to_load = list() + # if the random seed is not None, the random split is used + if self.random_split_seed is not None: + # given an integer random seed + skf = skms.StratifiedKFold(n_splits=10, shuffle=True, random_state=self.random_split_seed) + + # split the US8K samples into 10 folds + for fold_idx, (train_ids, test_ids) in enumerate(skf.split( + np.zeros(len(meta)), meta['classID'].values.astype(int) + ), 1): + # if this is 
the fold we want to load, add the corresponding files to the list + if fold_idx == self.fold: + ids = train_ids if self.train else test_ids + filenames = meta.iloc[ids].index + files_to_load.extend(filenames) + break + else: + # if the random seed is None, use the official split + for fn, row in meta.iterrows(): + if int(row['fold']) in self.folds_to_load: + files_to_load.append(fn) + + self.data = {fn: vals for fn, vals in self.data.items() if fn in files_to_load} + self.indices = {idx: fn for idx, fn in enumerate(self.data)} + + num_processes = os.cpu_count() + warnings.filterwarnings('ignore') + with mp.Pool(processes=num_processes) as pool: + tqdm.tqdm.write(f'Loading {self.__class__.__name__} (train={self.train})') + for fn, sample_rate, wav in pool.starmap( + func=self._load_worker, + iterable=[(fn, path, sr, mono) for fn, (path, sr, mono) in self.data.items()], + chunksize=int(np.ceil(len(meta) / num_processes)) or 1 + ): + self.data[fn] = { + 'audio': wav, + 'sample_rate': sample_rate, + 'target': meta.loc[fn, 'classID'], + 'category': meta.loc[fn, 'class'].replace('_', ' ').strip(' '), + 'background': bool(meta.loc[fn, 'salience'] - 1) + } + + def __getitem__(self, index: int) -> Tuple[np.ndarray, Optional[np.ndarray], List[str]]: + if not (0 <= index < len(self)): + raise IndexError + + audio: np.ndarray = self.data[self.indices[index]]['audio'] + target: str = self.data[self.indices[index]]['category'] + + if self.transform is not None: + audio = self.transform(audio) + if self.target_transform is not None: + target = self.target_transform(target) + + return audio, None, [target] + + def __len__(self) -> int: + return len(self.data) diff --git a/utils/simple_tokenizer.py b/utils/simple_tokenizer.py new file mode 100644 index 0000000..68946a1 --- /dev/null +++ b/utils/simple_tokenizer.py @@ -0,0 +1,134 @@ +# CREDITS: https://github.com/openai/CLIP + +import gzip +import html +import os +from functools import lru_cache + +import ftfy +import regex as re + + +@lru_cache() +def default_bpe(): + return os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'assets', 'bpe_simple_vocab_16e6.txt.gz') + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + Word is represented as tuple of symbols (symbols being variable-length strings). 
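+    Example (illustrative, not from the original CLIP code): get_pairs(('l', 'o', 'w')) returns {('l', 'o'), ('o', 'w')}.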
+
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+
+def basic_clean(text):
+    text = ftfy.fix_text(text)
+    text = html.unescape(html.unescape(text))
+    return text.strip()
+
+
+def whitespace_clean(text):
+    text = re.sub(r'\s+', ' ', text)
+    text = text.strip()
+    return text
+
+
+class SimpleTokenizer(object):
+    def __init__(self, bpe_path: str = default_bpe()):
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
+        merges = merges[1:49152-256-2+1]
+        merges = [tuple(merge.split()) for merge in merges]
+        vocab = list(bytes_to_unicode().values())
+        vocab = vocab + [v+'</w>' for v in vocab]
+        for merge in merges:
+            vocab.append(''.join(merge))
+        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
+        self.encoder = dict(zip(vocab, range(len(vocab))))
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))
+        self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
+        self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
+
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token[:-1]) + (token[-1] + '</w>',)
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token+'</w>'
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+
+                if word[i] == first and i < len(word)-1 and word[i+1] == second:
+                    new_word.append(first+second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = ' '.join(word)
+        self.cache[token] = word
+        return word
+
+    def encode(self, text):
+        bpe_tokens = []
+        text = whitespace_clean(basic_clean(text)).lower()
+        for token in re.findall(self.pat, text):
+            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
+            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
+        return bpe_tokens
+
+    def decode(self, tokens):
+        text = ''.join([self.decoder[token] for token in tokens])
+        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
+        return text
diff --git a/utils/transforms.py b/utils/transforms.py
new file mode 100644
index 0000000..082175e
--- /dev/null
+++ b/utils/transforms.py
@@ -0,0 +1,199 @@
+import math
+
+import numpy as np
+
+import torch
+import torchvision as tv
+
+import ignite_trainer as it
+
+
+def scale(old_value, old_min, old_max, new_min, new_max):
+    old_range = (old_max - old_min)
+    new_range = (new_max - new_min)
+    new_value = (((old_value - old_min) * new_range) / old_range) + new_min
+
+    return new_value
+
+
+def frame_signal(signal: torch.Tensor,
+                 frame_length: int,
+                 hop_length: int,
+                 window: torch.Tensor = None) -> torch.Tensor:
+
+    if window is None:
+        window = torch.ones(frame_length, dtype=signal.dtype, device=signal.device)
+
+    if window.shape[0] != frame_length:
+        raise ValueError('Wrong `window` length: expected {}, got {}'.format(frame_length, window.shape[0]))
+
+    
signal_length = signal.shape[-1] + + if signal_length <= frame_length: + num_frames = 1 + else: + num_frames = 1 + int(math.ceil((1.0 * signal_length - frame_length) / hop_length)) + + pad_len = int((num_frames - 1) * hop_length + frame_length) + if pad_len > signal_length: + zeros = torch.zeros(pad_len - signal_length, device=signal.device, dtype=signal.dtype) + + while zeros.dim() < signal.dim(): + zeros.unsqueeze_(0) + + pad_signal = torch.cat((zeros.expand(*signal.shape[:-1], -1)[..., :zeros.shape[-1] // 2], signal), dim=-1) + pad_signal = torch.cat((pad_signal, zeros.expand(*signal.shape[:-1], -1)[..., zeros.shape[-1] // 2:]), dim=-1) + else: + pad_signal = signal + + indices = torch.arange(0, frame_length, device=signal.device).repeat(num_frames, 1) + indices += torch.arange( + 0, + num_frames * hop_length, + hop_length, + device=signal.device + ).repeat(frame_length, 1).t_() + indices = indices.long() + + frames = pad_signal[..., indices] + frames = frames * window + + return frames + + +class ToTensor1D(tv.transforms.ToTensor): + + def __call__(self, tensor: np.ndarray): + tensor_2d = super(ToTensor1D, self).__call__(tensor[..., np.newaxis]) + + return tensor_2d.squeeze_(0) + + +class RandomFlip(it.AbstractTransform): + + def __init__(self, p: float = 0.5): + super(RandomFlip, self).__init__() + + self.p = p + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + if x.dim() > 2: + flip_mask = torch.rand(x.shape[0], device=x.device) <= self.p + x[flip_mask] = x[flip_mask].flip(-1) + else: + if torch.rand(1) <= self.p: + x = x.flip(0) + + return x + + +class RandomScale(it.AbstractTransform): + + def __init__(self, max_scale: float = 1.25): + super(RandomScale, self).__init__() + + self.max_scale = max_scale + + @staticmethod + def random_scale(max_scale: float, signal: torch.Tensor) -> torch.Tensor: + scaling = np.power(max_scale, np.random.uniform(-1, 1)) + output_size = int(signal.shape[-1] * scaling) + ref = torch.arange(output_size, device=signal.device, dtype=signal.dtype).div_(scaling) + + ref1 = ref.clone().type(torch.int64) + ref2 = torch.min(ref1 + 1, torch.full_like(ref1, signal.shape[-1] - 1, dtype=torch.int64)) + r = ref - ref1.type(ref.type()) + scaled_signal = signal[..., ref1] * (1 - r) + signal[..., ref2] * r + + return scaled_signal + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + return self.random_scale(self.max_scale, x) + + +class RandomCrop(it.AbstractTransform): + + def __init__(self, out_len: int = 44100, train: bool = True): + super(RandomCrop, self).__init__() + + self.out_len = out_len + self.train = train + + def random_crop(self, signal: torch.Tensor) -> torch.Tensor: + if self.train: + left = np.random.randint(0, signal.shape[-1] - self.out_len) + else: + left = int(round(0.5 * (signal.shape[-1] - self.out_len))) + + orig_std = signal.float().std() * 0.5 + output = signal[..., left:left + self.out_len] + + out_std = output.float().std() + if out_std < orig_std: + output = signal[..., :self.out_len] + + new_out_std = output.float().std() + if orig_std > new_out_std > out_std: + output = signal[..., -self.out_len:] + + return output + + def __call__(self, x: torch.Tensor) -> torch.Tensor: + return self.random_crop(x) if x.shape[-1] > self.out_len else x + + +class RandomPadding(it.AbstractTransform): + + def __init__(self, out_len: int = 88200, train: bool = True): + super(RandomPadding, self).__init__() + + self.out_len = out_len + self.train = train + + def random_pad(self, signal: torch.Tensor) -> torch.Tensor: + if self.train: + left = 
np.random.randint(0, self.out_len - signal.shape[-1])
+        else:
+            left = int(round(0.5 * (self.out_len - signal.shape[-1])))
+
+        right = self.out_len - (left + signal.shape[-1])
+
+        pad_value_left = signal[..., 0].float().mean().to(signal.dtype)
+        pad_value_right = signal[..., -1].float().mean().to(signal.dtype)
+        output = torch.cat((
+            torch.zeros(signal.shape[:-1] + (left,), dtype=signal.dtype, device=signal.device).fill_(pad_value_left),
+            signal,
+            torch.zeros(signal.shape[:-1] + (right,), dtype=signal.dtype, device=signal.device).fill_(pad_value_right)
+        ), dim=-1)
+
+        return output
+
+    def __call__(self, x: torch.Tensor) -> torch.Tensor:
+        return self.random_pad(x) if x.shape[-1] < self.out_len else x
+
+
+class RandomNoise(it.AbstractTransform):
+
+    def __init__(self, snr_min_db: float = -10.0, snr_max_db: float = 100.0, p: float = 0.5):
+        super(RandomNoise, self).__init__()
+
+        self.p = p
+        self.snr_min_db = snr_min_db
+        self.snr_max_db = snr_max_db
+
+    def random_noise(self, signal: torch.Tensor) -> torch.Tensor:
+        target_snr = np.random.rand() * (self.snr_max_db - self.snr_min_db + 1.0) + self.snr_min_db
+
+        signal_watts = torch.mean(signal ** 2, dim=(-1, -2))
+        signal_db = 10 * torch.log10(signal_watts)
+
+        noise_db = signal_db - target_snr
+        noise_watts = 10 ** (noise_db / 10)
+        noise = torch.randn(signal.shape, dtype=signal.dtype, device=signal.device) * (noise_watts.item() ** 0.5)
+
+        output = signal + noise
+
+        return output
+
+    def __call__(self, x: torch.Tensor) -> torch.Tensor:
+        return self.random_noise(x) if np.random.rand() <= self.p else x
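
For orientation, the snippet below sketches how the pieces added in this patch can be exercised in isolation: a BPE round-trip through `SimpleTokenizer`, and a fixed-length augmentation chain built from the waveform transforms in `utils/transforms.py`. It is an illustrative sketch only, not part of the patch; the clip length, sample rate, and transform order are assumptions, and it presumes the repository requirements are installed and the Git LFS assets (the BPE vocabulary) have been fetched.

```
# Illustrative usage sketch (not part of the patch).
import numpy as np

from utils.simple_tokenizer import SimpleTokenizer
from utils.transforms import ToTensor1D, RandomCrop, RandomPadding, RandomNoise

# Text side: encode/decode round-trip with the BPE tokenizer.
tokenizer = SimpleTokenizer()
token_ids = tokenizer.encode('a dog barking in the distance')
print(tokenizer.decode(token_ids))  # round-trips to 'a dog barking in the distance ' (trailing space from '</w>')

# Audio side: bring a raw stereo clip to a fixed 4-second length and add noise.
# The sample rate and target length below are assumptions for this sketch.
sample_rate = 22050
clip = np.random.randn(2, int(2.5 * sample_rate)).astype(np.float32)  # stand-in for a loaded waveform

audio = ToTensor1D()(clip)                                     # np.ndarray -> torch.Tensor, shape (2, T)
audio = RandomPadding(out_len=4 * sample_rate)(audio)          # pad clips shorter than out_len
audio = RandomCrop(out_len=4 * sample_rate)(audio)             # crop clips longer than out_len
audio = RandomNoise(snr_min_db=10.0, snr_max_db=100.0)(audio)  # additive noise at a random SNR, applied with probability p

print(audio.shape)  # torch.Size([2, 88200])
```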