From 7bf80fb96819e3c51eb3a5087a428dfa1356b830 Mon Sep 17 00:00:00 2001
From: "w.jurasz"
Date: Sat, 25 May 2019 12:17:40 +0200
Subject: [PATCH 01/58] Added gitignore and try catch on apex

---
 .gitignore      | 122 ++++++++++++++++++++++++++++++++++++++++++++++++
 modelwrapper.py |  15 +++---
 2 files changed, 131 insertions(+), 6 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7a0e43a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,122 @@
+# Created by .ignore support plugin (hsz.mobi)
+.idea/
+
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+### VirtualEnv template
+# Virtualenv
+# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
+.Python
+[Bb]in
+[Ii]nclude
+[Ll]ib
+[Ll]ib64
+[Ll]ocal
+[Ss]cripts
+pyvenv.cfg
+.venv
+pip-selfcheck.json
+
diff --git a/modelwrapper.py b/modelwrapper.py
index 3a0ec81..b313603 100644
--- a/modelwrapper.py
+++ b/modelwrapper.py
@@ -14,8 +14,13 @@
 import torch.distributed as dist
 import torch.utils.data.distributed
-from apex.fp16_utils import FP16_Optimizer
-from apex.parallel import DistributedDataParallel
+
+try:
+    from apex.fp16_utils import FP16_Optimizer
+    from apex.parallel import DistributedDataParallel
+except Exception as e:
+    print(f"Apex import failed: {e}")
+
 
 from tqdm import tqdm
 from warpctc_pytorch import CTCLoss
 
@@ -320,7 +325,7 @@ def train(self, seed, cuda, mixed_precision, world_size, gpu_rank, rank, save_fo
         train_sampler.shuffle(epoch)
 
     def validate(self):
-
+        pass
 
     def test(self):
         torch.set_grad_enabled(False)
@@ -386,8 +391,7 @@ def test(self):
         if save_output:
             np.save(output_path, output_data)
 
-
-def infer(self, sound):
+    def infer(self, sound):
         pass
 
     @staticmethod
@@ -401,7 +405,6 @@ def get_default_path(def_path: str) -> str:
         default = latest_subdir + "/final.pth"
         return default
 
-
     def print_training_info(self, epoch, loss, cer, wer):
         print(f"\nTraining Information\n " + \
               f"- Epoch:\t{epoch}\n " + \

From 855e5162ffcf9984db6c96e3da12a982223a56c8 Mon Sep 17 00:00:00 2001
From: yuriyarabskyy
Date: Sat, 25 May 2019 12:28:20 +0200
Subject: [PATCH 02/58] add requirements and move code to a subfolder

---
 STT_srv.py           | 43 ----
 config/infer.yaml    | 21 --
 config/test.yaml     |  9 -
 config/train.yaml    | 49 -----
 decoders/__init__.py |  0
decoders/beam_decoder.py | 75 ------- decoders/decoder.py | 85 -------- decoders/greedy_decoder.py | 74 ------- infer.py | 24 --- loader.py | 248 ---------------------- models/__init__.py | 0 models/deepspeech2.py | 269 ------------------------ modelwrapper.py | 413 ------------------------------------- test.py | 18 -- train.py | 18 -- utils.py | 51 ----- 16 files changed, 1397 deletions(-) delete mode 100644 STT_srv.py delete mode 100644 config/infer.yaml delete mode 100644 config/test.yaml delete mode 100644 config/train.yaml delete mode 100644 decoders/__init__.py delete mode 100644 decoders/beam_decoder.py delete mode 100644 decoders/decoder.py delete mode 100644 decoders/greedy_decoder.py delete mode 100644 infer.py delete mode 100644 loader.py delete mode 100644 models/__init__.py delete mode 100644 models/deepspeech2.py delete mode 100644 modelwrapper.py delete mode 100644 test.py delete mode 100644 train.py delete mode 100644 utils.py diff --git a/STT_srv.py b/STT_srv.py deleted file mode 100644 index 7013756..0000000 --- a/STT_srv.py +++ /dev/null @@ -1,43 +0,0 @@ -import os - -from roboy_cognition_msgs.msg import RecognizedSpeech -from roboy_cognition_msgs.srv import RecognizeSpeech - -from asr_interface import IAsr -import rclpy -from rclpy.node import Node - - -class SonoscoROS2(Node): - def __init__(self): - super().__init__('stt') - self.publisher = self.create_publisher(RecognizedSpeech, '/roboy/cognition/speech/recognition') - self.srv = self.create_service(RecognizeSpeech, '/roboy/cognition/speech/recognition/recognize', self.asr_callback) - print("Ready to /roboy/cognition/speech/recognition/recognize") - print(f"Roboy Sonosco running with PID: {os.getpid()}") - self.i=IAsr() - print(f"Status: Speech recognition is ready now!") - print("Roboy Sonosco is ready!") - - def asr_callback(self, request, response): - response.success = True - self.get_logger().info('Incoming Audio') - msg = RecognizedSpeech() - self.i.inference_audio(request) - self.publisher.publish(msg) - return response - - -def main(args=None): - rclpy.init(args=args) - - stt = SonoscoROS2() - - while rclpy.ok(): - rclpy.spin_once(stt) - - rclpy.shutdown() - - -if __name__ == '__main__': - main() diff --git a/config/infer.yaml b/config/infer.yaml deleted file mode 100644 index be61617..0000000 --- a/config/infer.yaml +++ /dev/null @@ -1,21 +0,0 @@ -infer: - model_name: "" - audio_path: "" # Audio file to predict on - - sample_rate: 16000 # Sample rate - window_size: 0.02 # Window size for spectrogram in seconds - window_stride: 0.01 # Window stride for spectrogram in seconds - window: 'hamming' # Window type for spectrogram generation - - beam_decoder: False # Turn on beam decoder. 
otherwise - greedy
-  alpha: 0.8
-  beam_width: 10
-  beta: 1
-  cutoff_prob: 1.0
-  cutoff_top_n: 40
-  lm_path: None                # Path to a KenLM binary
-  lm_workers: 1
-  offsets: False               # Returns time offset information
-  top_paths: 1
-
-  cuda: True                   # Use cuda to run model
\ No newline at end of file
diff --git a/config/test.yaml b/config/test.yaml
deleted file mode 100644
index 2589e15..0000000
--- a/config/test.yaml
+++ /dev/null
@@ -1,9 +0,0 @@
-test:
-  test_manifest: ""            # Path to test manifest csv
-
-  batch_size: 32               # Batch size for testing
-  num_workers: 4               # Number of workers used in loading
-  verbose: True                # Print out decoded output and error of each sample
-  save_output: True            # Saves output of model from test
-  output_path: ""              # Where to save raw acoustic output
-
diff --git a/config/train.yaml b/config/train.yaml
deleted file mode 100644
index 25be72c..0000000
--- a/config/train.yaml
+++ /dev/null
@@ -1,49 +0,0 @@
-train:
-  train_manifest: 'examples/manifests/train_manifest.csv'
-  val_manifest: 'examples/manifests/val_manifest.csv'
-  labels_path: 'examples/labels.json'   # Contains all characters for transcription
-  log_dir: 'logs'                       # Location for log files
-  def_dir: 'examples/checkpoints/'      # Default location to save/load models
-
-  load_from: 'asr_final.pth'            # File name containing a checkpoint to continue/finetune
-
-  sample_rate: 16000                    # Sample rate
-  window_size: 0.02                     # Window size for spectrogram in seconds
-  window_stride: 0.01                   # Window stride for spectrogram in seconds
-  window: 'hamming'                     # Window type for spectrogram generation
-
-  batch_size: 32                        # Batch size for training
-  hidden_size: 800                      # Hidden size of RNNs
-  hidden_layers: 5                      # Number of RNN layers
-  rnn_type: 'gru'                       # Type of the RNN unit: gru|lstm|rnn are supported
-
-  max_epochs: 70                        # Number of training epochs
-  learning_rate: 3e-4                   # Initial learning rate
-  momentum: 0.9                         # Momentum
-  max_norm: 800                         # Norm cutoff to prevent explosion of gradients
-  learning_anneal: 1.1                  # Annealing applied to learning rate every epoch
-  sortaGrad: True                       # Turn on ordering of dataset on sequence length for the first epoch
-
-  checkpoint: True                      # Enables checkpoint saving of model
-  checkpoint_per_epoch: 1               # Save checkpoint per x epochs
-  silent: False                         # Turn off progress tracking per iteration
-  verbose: False                        # Turn on verbose progress tracking
-  continue: False                       # Continue training with a pre-trained model
-  finetune: False                       # Finetune a pre-trained model
-
-  num_data_workers: 8                   # Number of workers used in data-loading
-  augment: False                        # Use random tempo and gain perturbations
-  shuffle: True                         # Turn on shuffling and sample from dataset based on sequence length (smallest to largest)
-
-  seed: 123456                          # Seed to generators
-  cuda: True                            # Use cuda to train model
-  half_precision: True                  # Uses half precision to train a model
-  apex: True                            # Uses mixed precision to train a model
-  static_loss_scaling: False            # Static loss scale for mixed precision
-  dynamic_loss_scaling: True            # Use dynamic loss scaling for mixed precision
-
-  dist_url: 'tcp://127.0.0.1:1550'      # URL used to set up distributed training
-  dist_backend: 'nccl'                  # Distributed backend
-  world_size: 1                         # Number of distributed processes
-  rank: 0                               # The rank of the current process
-  gpu_rank: 0                           # If using distributed parallel for multi_gpu, sets the GPU for the process
\ No newline at end of file
diff --git a/decoders/__init__.py b/decoders/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/decoders/beam_decoder.py b/decoders/beam_decoder.py
deleted file mode 100644
index c44d164..0000000
--- a/decoders/beam_decoder.py
+++ /dev/null
@@ -1,75 +0,0 @@
-#!/usr/bin/env python
-# ----------------------------------------------------------------------------
-# Copyright 2015-2016 Nervana Systems Inc.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ----------------------------------------------------------------------------
-# Modified to support pytorch Tensors
-import torch
-
-from decoders.decoder import Decoder
-
-
-class BeamCTCDecoder(Decoder):
-    def __init__(self, labels, lm_path=None, alpha=0, beta=0, cutoff_top_n=40, cutoff_prob=1.0, beam_width=100,
-                 num_processes=4, blank_index=0):
-        super(BeamCTCDecoder, self).__init__(labels)
-        try:
-            from ctcdecode import CTCBeamDecoder
-        except ImportError:
-            raise ImportError("BeamCTCDecoder requires the ctcdecode package.")
-        self._decoder = CTCBeamDecoder(labels, lm_path, alpha, beta, cutoff_top_n, cutoff_prob, beam_width,
-                                       num_processes, blank_index)
-
-    def convert_to_strings(self, out, seq_len):
-        results = []
-        for b, batch in enumerate(out):
-            utterances = []
-            for p, utt in enumerate(batch):
-                size = seq_len[b][p]
-                if size > 0:
-                    transcript = ''.join(map(lambda x: self.int_to_char[x.item()], utt[0:size]))
-                else:
-                    transcript = ''
-                utterances.append(transcript)
-            results.append(utterances)
-        return results
-
-    def convert_tensor(self, offsets, sizes):
-        results = []
-        for b, batch in enumerate(offsets):
-            utterances = []
-            for p, utt in enumerate(batch):
-                size = sizes[b][p]
-                if sizes[b][p] > 0:
-                    utterances.append(utt[0:size])
-                else:
-                    utterances.append(torch.tensor([], dtype=torch.int))
-            results.append(utterances)
-        return results
-
-    def decode(self, probs, sizes=None):
-        """
-        Decodes probability output using ctcdecode package.
-        Arguments:
-            probs: Tensor of character probabilities, where probs[c,t]
-                   is the probability of character c at time t
-            sizes: Size of each sequence in the mini-batch
-        Returns:
-            string: sequences of the model's best guess for the transcription
-        """
-        probs = probs.cpu()
-        out, scores, offsets, seq_lens = self._decoder.decode(probs, sizes)
-
-        strings = self.convert_to_strings(out, seq_lens)
-        offsets = self.convert_tensor(offsets, seq_lens)
-        return strings, offsets
\ No newline at end of file
diff --git a/decoders/decoder.py b/decoders/decoder.py
deleted file mode 100644
index 99a8193..0000000
--- a/decoders/decoder.py
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/usr/bin/env python
-# ----------------------------------------------------------------------------
-# Copyright 2015-2016 Nervana Systems Inc.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. -# ---------------------------------------------------------------------------- -# Modified to support pytorch Tensors - -import Levenshtein as Lev - - -class Decoder(object): - """ - Basic decoder class from which all other decoders inherit. Implements several - helper functions. Subclasses should implement the decode() method. - - Arguments: - labels (string): mapping from integers to characters. - blank_index (int, optional): index for the blank '_' character. Defaults to 0. - space_index (int, optional): index for the space ' ' character. Defaults to 28. - """ - - def __init__(self, labels, blank_index=0): - # e.g. labels = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ#" - self.labels = labels - self.int_to_char = dict([(i, c) for (i, c) in enumerate(labels)]) - self.blank_index = blank_index - space_index = len(labels) # To prevent errors in decode, we add an out of bounds index for the space - if ' ' in labels: - space_index = labels.index(' ') - self.space_index = space_index - - def wer(self, s1, s2): - """ - Computes the Word Error Rate, defined as the edit distance between the - two provided sentences after tokenizing to words. - Arguments: - s1 (string): space-separated sentence - s2 (string): space-separated sentence - """ - - # build mapping of words to integers - b = set(s1.split() + s2.split()) - word2char = dict(zip(b, range(len(b)))) - - # map the words to a char array (Levenshtein packages only accepts - # strings) - w1 = [chr(word2char[w]) for w in s1.split()] - w2 = [chr(word2char[w]) for w in s2.split()] - - return Lev.distance(''.join(w1), ''.join(w2)) - - def cer(self, s1, s2): - """ - Computes the Character Error Rate, defined as the edit distance. - - Arguments: - s1 (string): space-separated sentence - s2 (string): space-separated sentence - """ - s1, s2, = s1.replace(' ', ''), s2.replace(' ', '') - return Lev.distance(s1, s2) - - def decode(self, probs, sizes=None): - """ - Given a matrix of character probabilities, returns the decoder's - best guess of the transcription - - Arguments: - probs: Tensor of character probabilities, where probs[c,t] - is the probability of character c at time t - sizes(optional): Size of each sequence in the mini-batch - Returns: - string: sequence of the model's best guess for the transcription - """ - raise NotImplementedError diff --git a/decoders/greedy_decoder.py b/decoders/greedy_decoder.py deleted file mode 100644 index c14884f..0000000 --- a/decoders/greedy_decoder.py +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env python -# ---------------------------------------------------------------------------- -# Copyright 2015-2016 Nervana Systems Inc. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ----------------------------------------------------------------------------
-# Modified to support pytorch Tensors
-
-import torch
-
-from decoders.decoder import Decoder
-
-
-class GreedyDecoder(Decoder):
-    def __init__(self, labels, blank_index=0):
-        super(GreedyDecoder, self).__init__(labels, blank_index)
-
-    def convert_to_strings(self, sequences, sizes=None, remove_repetitions=False, return_offsets=False):
-        """Given a list of numeric sequences, returns the corresponding strings"""
-        strings = []
-        offsets = [] if return_offsets else None
-        for x in range(len(sequences)):
-            seq_len = sizes[x] if sizes is not None else len(sequences[x])
-            string, string_offsets = self.process_string(sequences[x], seq_len, remove_repetitions)
-            strings.append([string])  # We only return one path
-            if return_offsets:
-                offsets.append([string_offsets])
-        if return_offsets:
-            return strings, offsets
-        else:
-            return strings
-
-    def process_string(self, sequence, size, remove_repetitions=False):
-        string = ''
-        offsets = []
-        for i in range(size):
-            char = self.int_to_char[sequence[i].item()]
-            if char != self.int_to_char[self.blank_index]:
-                # if this char is a repetition and remove_repetitions=true, then skip
-                if remove_repetitions and i != 0 and char == self.int_to_char[sequence[i - 1].item()]:
-                    pass
-                elif char == self.labels[self.space_index]:
-                    string += ' '
-                    offsets.append(i)
-                else:
-                    string = string + char
-                    offsets.append(i)
-        return string, torch.tensor(offsets, dtype=torch.int)
-
-    def decode(self, probs, sizes=None):
-        """
-        Returns the argmax decoding given the probability matrix. Removes
-        repeated elements in the sequence, as well as blanks.
-
-        Arguments:
-            probs: Tensor of character probabilities from the network. Expected shape of batch x seq_length x output_dim
-            sizes(optional): Size of each sequence in the mini-batch
-        Returns:
-            strings: sequences of the model's best guess for the transcription on inputs
-            offsets: time step per character predicted
-        """
-        _, max_probs = torch.max(probs, 2)
-        strings, offsets = self.convert_to_strings(max_probs.view(max_probs.size(0), max_probs.size(1)), sizes,
-                                                   remove_repetitions=True, return_offsets=True)
-        return strings, offsets
diff --git a/infer.py b/infer.py
deleted file mode 100644
index 8fe03d2..0000000
--- a/infer.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import argparse
-import os
-import wave
-from typing import Dict
-
-import yaml
-
-from modelwrapper import ModelWrapper
-
-parser = argparse.ArgumentParser(description='ASR inference')
-parser.add_argument('--config', metavar='DIR',
-                    help='Path to inference config file', default='config/infer.yaml')
-
-if __name__ == '__main__':
-    args = parser.parse_args()
-    with open(args.config, 'r') as file:
-        config = yaml.safe_load(file)
-    config_dict: Dict = config["infer"]
-    model = ModelWrapper(**config_dict)
-    # the config file defines this key as "audio_path"
-    if "audio_path" in config_dict.keys() and os.path.isfile(config_dict.get("audio_path")):
-        sound = wave.open(config_dict.get("audio_path"))
-        print(model.infer(sound))
-    else:
-        print("Wave file not found!")
diff --git a/loader.py b/loader.py
deleted file mode 100644
index 00b06d1..0000000
--- a/loader.py
+++ /dev/null
@@ -1,248 +0,0 @@
-# ----------------------------------------------------------------------------
-# Based on SeanNaren's deepspeech.pytorch:
-# https://github.com/SeanNaren/deepspeech.pytorch
-# ----------------------------------------------------------------------------
-
-import math
-import warnings
-from typing import Tuple
-
-import librosa
-import numpy as np
-import torch
-import torchaudio -from scipy import signal -from torch.utils.data import Dataset, DataLoader, Sampler -from torch.distributed import get_rank -from torch.distributed import get_world_size - -windows = {"bartlett": torch.bartlett_window, - "blackman": torch.blackman_window, - "hamming": torch.hamming_window, - "hann": torch.hann_window} - -windows_legacy = {'hamming': signal.hamming, - 'hann': signal.hann, - 'blackman': signal.blackman, - 'bartlett': signal.bartlett} - - -class DataProcessor(object): - def __init__(self, audio_conf, labels="abc", normalize=False, augment=False, legacy=True): - """ - Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by - a comma. Each new line is a different sample. Example below: - - /path/to/audio.wav,/path/to/audio.txt - ... - - :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds - :param labels: String containing all the possible characters to map to - :param normalize: Apply standard mean and deviation normalization to audio tensor - :param augment(default False): Apply random tempo and gain perturbations - """ - self.labels_map = dict([(labels[i], i) for i in range(len(labels))]) - self.window_stride = audio_conf["window_stride"] - self.window_size = audio_conf["window_size"] - self.sample_rate = audio_conf["sample_rate"] - self.window = windows_legacy.get(audio_conf["window"], windows_legacy["hamming"]) if legacy else windows.get(audio_conf["window"], windows["hamming"]) - self.normalize = normalize - self.augment = augment - self.legacy = legacy - self.transform = torchaudio.transforms.Spectrogram(n_fft=int(self.sample_rate * self.window_size), - hop=int(self.sample_rate * self.window_stride), - window=self.window, normalize=self.normalize) - - @staticmethod - def retrieve_file(audio_path, legacy=True): - sound, sample_rate = torchaudio.load(audio_path) - if legacy: - sound = sound.numpy().T - if len(sound.shape) > 1: - if sound.shape[1] == 1: - sound = sound.squeeze() - else: - sound = sound.mean(axis=1) - return sound, sample_rate - - @staticmethod - def augment_audio(sound, tempo_range: Tuple = (0.85, 1.15), gain_range: Tuple = (-6, 8)): - """ - Changes tempo and gain of the wave - """ - warnings.warn("Augmentation is not implemented") # TODO: Implement - return sound - - def parse_audio(self, audio_path): - sound, sample_rate = self.retrieve_file(audio_path, self.legacy) - if sample_rate != self.sample_rate: - raise ValueError(f"The stated sample rate {self.sample_rate} and the factual rate {sample_rate} differ!") - - if self.augment: - sound = self.augment_audio(sound) - - if self.legacy: - n_fft = int(self.sample_rate * self.window_size) - win_length = n_fft - hop_length = int(self.sample_rate * self.window_stride) - # STFT - D = librosa.stft(sound, n_fft=n_fft, hop_length=hop_length, - win_length=win_length, window=self.window) - spectrogram, phase = librosa.magphase(D) - # S = log(S+1) - - spectrogram = torch.FloatTensor(np.log1p(spectrogram)) - else: - # TODO: Why these are different from librosa.stft? 
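-            # Editor's note (assumption): the legacy branch log-scales magnitudes with
-            # np.log1p while this branch does not, and librosa's STFT defaults (e.g.
-            # centered, reflect-padded frames) need not match this torchaudio
-            # transform's settings, so some numeric difference is expected here.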
- sound = sound.cuda() - spectrogram = self.transform(sound)[-1, :, :].transpose(0, 1) - - # spectrogram = torch.stft(torch.from_numpy(sound.numpy().T.squeeze()), - # n_fft=int(self.sample_rate * self.window_size), - # hop_length=int(self.sample_rate * self.window_stride), - # win_length=int(self.sample_rate * self.window_size), - # window=torch.hamming_window(int(self.sample_rate * self.window_size)))[:, :, -1] - - if self.normalize: - mean = spectrogram.mean() - std = spectrogram.std() - spectrogram.add_(-mean) - spectrogram.div_(std) - - return spectrogram - - def parse_transcript(self, transcript_path): - with open(transcript_path, 'r', encoding='utf8') as transcript_file: - transcript = transcript_file.read().replace('\n', '') - # TODO: Is it fast enough? - transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)])) - return transcript - - -class AudioDataset(Dataset): - def __init__(self, audio_conf, manifest_filepath, labels, normalize=False, augment=False, legacy=True): - """ - Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by - a comma. Each new line is a different sample. Example below: - - /path/to/audio.wav,/path/to/audio.txt - ... - - :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds - :param manifest_filepath: Path to manifest csv as describe above - :param labels: String containing all the possible characters to map to - :param normalize: Apply standard mean and deviation normalization to audio tensor - :param augment(default False): Apply random tempo and gain perturbations - """ - super(AudioDataset, self).__init__() - with open(manifest_filepath) as f: - ids = f.readlines() - ids = [x.strip().split(',') for x in ids] - self.ids = ids - self.size = len(ids) - self.processor = DataProcessor(audio_conf, labels, normalize, augment, legacy) - - def __getitem__(self, index): - sample = self.ids[index] - audio_path, transcript_path = sample[0], sample[1] - - spectrogram = self.processor.parse_audio(audio_path) - transcript = self.processor.parse_transcript(transcript_path) - - return spectrogram, transcript - - def __len__(self): - return self.size - - -# TODO: Optimise -def _collate_fn(batch): - batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True) - longest_sample = batch[0][0] - freq_size, max_seqlength = longest_sample.size() - minibatch_size = len(batch) - inputs = torch.zeros(minibatch_size, 1, freq_size, max_seqlength) - input_percentages = torch.FloatTensor(minibatch_size) - target_sizes = np.zeros(minibatch_size, dtype=np.int32) - - # TODO: Numpy broadcasting magic - targets = [] - - for x in range(minibatch_size): - inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) - input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) - target_sizes[x] = len(batch[x][1]) - targets.extend(batch[x][1]) - - return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) - - -class AudioDataLoader(DataLoader): - def __init__(self, *args, **kwargs): - """ - Creates a data loader for AudioDatasets. - """ - super(AudioDataLoader, self).__init__(*args, **kwargs) - self.collate_fn = _collate_fn - - -class BucketingSampler(Sampler): - def __init__(self, data_source, batch_size=1): - """ - Samples batches assuming they are in order of size to batch similarly sized samples together. 
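-        Keeping each batch roughly homogeneous in length reduces the padding the
-        collate function must add, and with it the wasted computation per batch.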
- """ - super(BucketingSampler, self).__init__(data_source) - self.data_source = data_source - ids = list(range(0, len(data_source))) - # TODO: Optimise - self.bins = [ids[i:i + batch_size] for i in range(0, len(ids), batch_size)] - - def __iter__(self): - for ids in self.bins: - np.random.shuffle(ids) - yield ids - - def __len__(self): - return len(self.bins) - - def shuffle(self, epoch): - np.random.shuffle(self.bins) - - -# TODO: Optimise -class DistributedBucketingSampler(Sampler): - def __init__(self, data_source, batch_size=1, num_replicas=None, rank=None): - """ - Samples batches assuming they are in order of size to batch similarly sized samples together. - """ - super(DistributedBucketingSampler, self).__init__(data_source) - if num_replicas is None: - num_replicas = get_world_size() - if rank is None: - rank = get_rank() - self.data_source = data_source - self.ids = list(range(0, len(data_source))) - self.batch_size = batch_size - self.bins = [self.ids[i:i + batch_size] for i in range(0, len(self.ids), batch_size)] - self.num_replicas = num_replicas - self.rank = rank - self.num_samples = int(math.ceil(len(self.bins) * 1.0 / self.num_replicas)) - self.total_size = self.num_samples * self.num_replicas - - def __iter__(self): - offset = self.rank - # add extra samples to make it evenly divisible - bins = self.bins + self.bins[:(self.total_size - len(self.bins))] - assert len(bins) == self.total_size - samples = bins[offset::self.num_replicas] # Get every Nth bin, starting from rank - return iter(samples) - - def __len__(self): - return self.num_samples - - def shuffle(self, epoch): - # deterministically shuffle based on epoch - g = torch.Generator() - g.manual_seed(epoch) - bin_ids = list(torch.randperm(len(self.bins), generator=g)) - self.bins = [self.bins[i] for i in bin_ids] diff --git a/models/__init__.py b/models/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/models/deepspeech2.py b/models/deepspeech2.py deleted file mode 100644 index dcfa100..0000000 --- a/models/deepspeech2.py +++ /dev/null @@ -1,269 +0,0 @@ -# ---------------------------------------------------------------------------- -# Based on SeanNaren's deepspeech.pytorch: -# https://github.com/SeanNaren/deepspeech.pytorch -# ---------------------------------------------------------------------------- - -import math -from collections import OrderedDict - -import torch -import torch.nn as nn -import torch.nn.functional as F - - -supported_rnns = { - 'lstm': nn.LSTM, - 'rnn': nn.RNN, - 'gru': nn.GRU -} -supported_rnns_inv = dict((v, k) for k, v in supported_rnns.items()) - - -class SequenceWise(nn.Module): - def __init__(self, module): - """ - Collapses input of dim T*N*H to (T*N)*H, and applies to a module. - Allows handling of variable sequence lengths and minibatch sizes. - :param module: Module to apply input to. - """ - super(SequenceWise, self).__init__() - self.module = module - - def forward(self, x): - t, n = x.size(0), x.size(1) - x = x.view(t * n, -1) - x = self.module(x) - x = x.view(t, n, -1) - return x - - def __repr__(self): - tmpstr = self.__class__.__name__ + ' (\n' - tmpstr += self.module.__repr__() - tmpstr += ')' - return tmpstr - - -class MaskConv(nn.Module): - def __init__(self, seq_module): - """ - Adds padding to the output of the module based on the given lengths. This is to ensure that the - results of the model do not change when batch sizes change during inference. 
-        Input needs to be in the shape of (BxCxDxT)
-        :param seq_module: The sequential module containing the conv stack.
-        """
-        super(MaskConv, self).__init__()
-        self.seq_module = seq_module
-
-    def forward(self, x, lengths):
-        """
-        :param x: The input of size BxCxDxT
-        :param lengths: The actual length of each sequence in the batch
-        :return: Masked output from the module
-        """
-        for module in self.seq_module:
-            x = module(x)
-            mask = torch.ByteTensor(x.size()).fill_(0)
-            if x.is_cuda:
-                mask = mask.cuda()
-            for i, length in enumerate(lengths):
-                length = length.item()
-                if (mask[i].size(2) - length) > 0:
-                    mask[i].narrow(2, length, mask[i].size(2) - length).fill_(1)
-            x = x.masked_fill(mask, 0)
-        return x, lengths
-
-
-class InferenceBatchSoftmax(nn.Module):
-    def forward(self, input_):
-        if not self.training:
-            return F.softmax(input_, dim=-1)
-        else:
-            return input_
-
-
-class BatchRNN(nn.Module):
-    def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, batch_norm=True, bidirectional=True):
-        super(BatchRNN, self).__init__()
-        self.input_size = input_size
-        self.hidden_size = hidden_size
-        self.bidirectional = bidirectional  # stored so forward() can sum the two directions
-        self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None
-        self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size,
-                            bidirectional=bidirectional, bias=True)
-
-    def flatten_parameters(self):
-        self.rnn.flatten_parameters()
-
-    def forward(self, x, output_lengths):
-        if self.batch_norm is not None:
-            x = self.batch_norm(x)
-        x = nn.utils.rnn.pack_padded_sequence(x, output_lengths)
-        x, h = self.rnn(x)
-        x, _ = nn.utils.rnn.pad_packed_sequence(x)
-        if self.bidirectional:
-            x = x.view(x.size(0), x.size(1), 2, -1).sum(2).view(x.size(0), x.size(1), -1)  # (TxNxH*2) -> (TxNxH) by sum
-        return x
-
-
-class DeepSpeech2(nn.Module):
-    def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hid_size=768, nb_layers=5, audio_conf=None,
-                 bidirectional=True, mixed_precision=False):
-        super(DeepSpeech2, self).__init__()
-
-        # model metadata needed for serialization/deserialization
-        if audio_conf is None:
-            audio_conf = {}
-        self.version = '0.0.1'
-        self.hidden_size = rnn_hid_size
-        self.hidden_layers = nb_layers
-        self.rnn_type = rnn_type
-        self.audio_conf = audio_conf or {}
-        self.labels = labels
-        self.bidirectional = bidirectional
-        self.mixed_precision = mixed_precision  # referenced by ModelWrapper.train and utils.load_model
-
-        sample_rate = self.audio_conf.get("sample_rate", 16000)
-        window_size = self.audio_conf.get("window_size", 0.02)
-        num_classes = len(self.labels)
-
-        self.conv = MaskConv(nn.Sequential(
-            nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5)),
-            nn.BatchNorm2d(32),
-            nn.Hardtanh(0, 20, inplace=True),
-            nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5)),
-            nn.BatchNorm2d(32),
-            nn.Hardtanh(0, 20, inplace=True)
-        ))
-        # Based on above convolutions and spectrogram size using conv formula (W - F + 2P)/ S+1
-        rnn_in_size = int(math.floor((sample_rate * window_size) / 2) + 1)
-        rnn_in_size = int(math.floor(rnn_in_size + 2 * 20 - 41) / 2 + 1)
-        rnn_in_size = int(math.floor(rnn_in_size + 2 * 10 - 21) / 2 + 1)
-        rnn_in_size *= 32
-
-        rnns = [('0', BatchRNN(input_size=rnn_in_size, hidden_size=rnn_hid_size, rnn_type=rnn_type, batch_norm=False))]
-        rnns.extend([(f"{x + 1}", BatchRNN(input_size=rnn_hid_size, hidden_size=rnn_hid_size, rnn_type=rnn_type))
-                     for x in range(nb_layers - 1)])
-        self.rnns = nn.Sequential(OrderedDict(rnns))
-
-        fully_connected = nn.Sequential(
-            nn.BatchNorm1d(rnn_hid_size),
-            nn.Linear(rnn_hid_size, num_classes, bias=False)
-        )
-
-        self.fc = nn.Sequential(
-            SequenceWise(fully_connected),
-        )
-
-        # Softmax only at inference time: warp-ctc applies log-softmax internally,
-        # so during training the raw activations go straight to the loss.
-        self.inference_softmax = InferenceBatchSoftmax()
-
-    def forward(self, x, lengths):
-        # if x.is_cuda and self.mixed_precision:
-        #     x = x.half()
-        lengths = lengths.cpu().int()
-        output_lengths = self.get_seq_lens(lengths)
-        x, _ = self.conv(x, output_lengths)
-
-        sizes = x.size()
-        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # Collapse feature dimension
-        x = x.transpose(1, 2).transpose(0, 1).contiguous()  # TxNxH
-
-        for rnn in self.rnns:
-            x = rnn(x, output_lengths)
-
-        if not self.bidirectional:  # no need for lookahead layer in bidirectional
-            x = self.lookahead(x)  # NOTE: no lookahead layer is defined in this port yet
-
-        x = self.fc(x)
-        x = x.transpose(0, 1)
-        # identity in training mode, softmax in eval mode
-        x = self.inference_softmax(x)
-        return x, output_lengths
-
-    def get_seq_lens(self, input_length):
-        """
-        Given a 1D Tensor or Variable containing integer sequence lengths, return a 1D tensor or variable
-        containing the size sequences that will be output by the network.
-        :param input_length: 1D Tensor
-        :return: 1D Tensor scaled by model
-        """
-        seq_len = input_length
-        for m in self.conv.modules():
-            if type(m) == nn.modules.conv.Conv2d:
-                seq_len = ((seq_len + 2 * m.padding[1] - m.dilation[1] * (m.kernel_size[1] - 1) - 1) / m.stride[1] + 1)
-        return seq_len.int()
-
-    @classmethod
-    def load_model(cls, path):
-        package = torch.load(path, map_location=lambda storage, loc: storage)
-        model = cls(rnn_hid_size=package['hidden_size'], nb_layers=package['hidden_layers'],
-                    labels=package['labels'], audio_conf=package['audio_conf'],
-                    rnn_type=supported_rnns[package['rnn_type']], bidirectional=package.get('bidirectional', True))
-        model.load_state_dict(package['state_dict'])
-        for x in model.rnns:
-            x.flatten_parameters()
-
-        return model
-
-    @classmethod
-    def load_model_package(cls, package):
-        model = cls(rnn_hid_size=package['hidden_size'], nb_layers=package['hidden_layers'],
-                    labels=package['labels'], audio_conf=package['audio_conf'],
-                    rnn_type=supported_rnns[package['rnn_type']], bidirectional=package.get('bidirectional', True))
-        model.load_state_dict(package['state_dict'])
-
-        return model
-
-    @staticmethod
-    def serialize(model, optimizer=None, epoch=None, iteration=None, loss_results=None,
-                  cer_results=None, wer_results=None, avg_loss=None, meta=None):
-        package = {
-            'version': model.version,
-            'hidden_size': model.hidden_size,
-            'hidden_layers': model.hidden_layers,
-            'rnn_type': supported_rnns_inv.get(model.rnn_type, model.rnn_type.__name__.lower()),
-            'audio_conf': model.audio_conf,
-            'labels': model.labels,
-            'state_dict': model.state_dict(),
-            'bidirectional': model.bidirectional
-        }
-        if optimizer is not None:
-            package['optim_dict'] = optimizer.state_dict()
-        if avg_loss is not None:
-            package['avg_loss'] = avg_loss
-        if epoch is not None:
-            package['epoch'] = epoch + 1  # increment for readability
-        if iteration is not None:
-            package['iteration'] = iteration
-        if loss_results is not None:
-            package['loss_results'] = loss_results
-            package['cer_results'] = cer_results
-            package['wer_results'] = wer_results
-        if meta is not None:
-            package['meta'] = meta
-        return package
-
-    @staticmethod
-    def get_param_size(model):
-        params = 0
-        for p in model.parameters():
-            tmp = 1
-            for x in p.size():
-                tmp *= x
-            params += tmp
-        return params
-
-    def __repr__(self):
-        rep = f"DeepSpeech2 version: {self.version}\n" + \
-              "=======================================\n" + \
-              "Recurrent Neural Network Properties\n" + \
-              f"  RNN Type: \t{self.rnn_type.__name__.lower()}\n" + \
-              f"  RNN Layers:\t{self.hidden_layers}\n" + \
- f" RNN Size: \t{self.hidden_size}\n" + \ - f" Classes: \t{len(self.labels)}\n" + \ - "---------------------------------------\n" + \ - "Model Features\n" + \ - f" Labels: \t{self.labels}\n" + \ - f" Sample Rate: \t{self.audio_conf.get('sample_rate', 'n/a')}\n" + \ - f" Window Type: \t{self.audio_conf.get('window', 'n/a')}\n" + \ - f" Window Size: \t{self.audio_conf.get('window_size', 'n/a')}\n" + \ - f" Window Stride:\t{self.audio_conf.get('window_stride', 'n/a')}" - return rep diff --git a/modelwrapper.py b/modelwrapper.py deleted file mode 100644 index b313603..0000000 --- a/modelwrapper.py +++ /dev/null @@ -1,413 +0,0 @@ -import os.path -from random import random -from datetime import datetime - -import numpy as np -import torch - -from models.deepspeech2 import DeepSpeech2 - -import json -import os -import random -import time - -import torch.distributed as dist -import torch.utils.data.distributed - -try: - from apex.fp16_utils import FP16_Optimizer - from apex.parallel import DistributedDataParallel -except Exception as e: - print(f"Apex import failed: {e}") - -from tqdm import tqdm -from warpctc_pytorch import CTCLoss - -from loader import AudioDataLoader, AudioDataset, BucketingSampler, DistributedBucketingSampler -from decoders.greedy_decoder import GreedyDecoder -from utils import convert_model_to_half, reduce_tensor, check_loss - -models = {"deepspeech2": DeepSpeech2} - -sttime = datetime.now() -print(f"Time of start: {sttime}") - - -def to_np(x): - return x.cpu().numpy() - - -class AverageMeter(object): - """Computes and stores the average and current value""" - - def __init__(self): - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - -class ModelWrapper(object): - DEF_PATH = "examples/checkpoints/" - - def __init__(self, **kwargs): - self.model = kwargs.get("model", models["deepspeech2"]) - - if kwargs.get("continue"): - path = kwargs.get("from", ModelWrapper.get_default_path()) - self.model.package = torch.load(path, map_location=lambda storage, loc: storage) - self.model.load_model(path) - - self.save_path = kwargs.get("save", ModelWrapper.DEF_PATH + str(datetime.now().timestamp())) - - self.cuda = kwargs.get("cuda") - self.apex = kwargs.get("apex") if self.cuda else False - self.half = self.apex if self.apex else kwargs.get("half") - - def train(self, seed, cuda, mixed_precision, world_size, gpu_rank, rank, save_folder, dist_backend, dist_url, - epochs, continue_from, finetune, labels_path, sample_rate, window_size, window_stride, window, - hidden_size, hidden_layers, labels, supported_rnns, bidirectional, no_shuffle, no_sorta_grad, rnn_type, - train_manifest, augment, batch_size, num_workers, momentum, lr, static_loss_scale, dynamic_loss_scale, - val_manifest, max_norm, silent, checkpoint_per_batch, checkpoint, learning_anneal, model_path): - # Set seeds for determinism - torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - np.random.seed(seed) - random.seed(seed) - - device = torch.device("cuda" if cuda else "cpu") - if mixed_precision and not cuda: - raise ValueError('If using mixed precision training, CUDA must be enabled!') - distributed = world_size > 1 - main_proc = True - device = torch.device("cuda" if cuda else "cpu") - if distributed: - if gpu_rank: - torch.cuda.set_device(int(gpu_rank)) - dist.init_process_group(backend=dist_backend, init_method=dist_url, - world_size=world_size, 
rank=rank)
-            main_proc = rank == 0  # Only the first proc should save models
-        save_folder = save_folder
-        os.makedirs(save_folder, exist_ok=True)  # Ensure save folder exists
-
-        loss_results, cer_results, wer_results = torch.Tensor(epochs), torch.Tensor(epochs), torch.Tensor(epochs)
-        best_wer = None
-
-        avg_loss, start_epoch, start_iter, optim_state = 0, 0, 0, None
-        if continue_from:  # Starting from previous model
-            print("Loading checkpoint model %s" % continue_from)
-
-            labels = self.model.labels
-            audio_conf = self.model.audio_conf
-            if not finetune:  # Don't want to restart training
-                optim_state = self.model.package['optim_dict']
-                start_epoch = int(self.model.package.get('epoch', 1)) - 1  # Index start at 0 for training
-                start_iter = self.model.package.get('iteration', None)
-                if start_iter is None:
-                    start_epoch += 1  # We saved model after epoch finished, start at the next epoch.
-                    start_iter = 0
-                else:
-                    start_iter += 1
-                avg_loss = int(self.model.package.get('avg_loss', 0))
-                loss_results, cer_results, wer_results = self.model.package['loss_results'], \
-                                                         self.model.package['cer_results'], \
-                                                         self.model.package['wer_results']
-        else:
-            with open(labels_path) as label_file:
-                labels = str(''.join(json.load(label_file)))
-
-            audio_conf = dict(sample_rate=sample_rate,
-                              window_size=window_size,
-                              window_stride=window_stride,
-                              window=window)
-
-        rnn_type = rnn_type.lower()
-        assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
-        model = self.model(rnn_hid_size=hidden_size,
-                           nb_layers=hidden_layers,
-                           labels=labels,
-                           rnn_type=supported_rnns[rnn_type],
-                           audio_conf=audio_conf,
-                           bidirectional=bidirectional,
-                           mixed_precision=mixed_precision)
-
-        decoder = GreedyDecoder(labels)
-
-        train_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=train_manifest, labels=labels,
-                                     normalize=False, augment=augment)
-        test_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=val_manifest, labels=labels,
-                                    normalize=False, augment=False)
-        if not distributed:
-            train_sampler = BucketingSampler(train_dataset, batch_size=batch_size)
-        else:
-            train_sampler = DistributedBucketingSampler(train_dataset, batch_size=batch_size,
-                                                        num_replicas=world_size, rank=rank)
-
-        train_loader = AudioDataLoader(train_dataset, num_workers=num_workers, batch_sampler=train_sampler)
-        test_loader = AudioDataLoader(test_dataset, batch_size=batch_size, num_workers=num_workers)
-
-        if (not no_shuffle and start_epoch != 0) or no_sorta_grad:
-            print("Shuffling batches for the following epochs")
-            train_sampler.shuffle(start_epoch)
-
-        model = model.to(device)
-        if mixed_precision:
-            model = convert_model_to_half(model)
-        parameters = model.parameters()
-        optimizer = torch.optim.SGD(parameters, lr=lr,
-                                    momentum=momentum, nesterov=True, weight_decay=1e-5)
-        if distributed:
-            model = DistributedDataParallel(model)
-        if mixed_precision:
-            optimizer = FP16_Optimizer(optimizer,
-                                       static_loss_scale=static_loss_scale,
-                                       dynamic_loss_scale=dynamic_loss_scale)
-        if optim_state is not None:
-            optimizer.load_state_dict(optim_state)
-        print(model)
-        print("Number of parameters: %d" % self.model.get_param_size(model))
-
-        criterion = CTCLoss()
-        batch_time = AverageMeter()
-        data_time = AverageMeter()
-        losses = AverageMeter()
-
-        for epoch in range(start_epoch, epochs):
-            model.train()
-            end = time.time()
-            start_epoch_time = time.time()
-            for i, (data) in enumerate(train_loader, start=start_iter):
-                if i == len(train_sampler):
-                    break
-                inputs, targets, input_percentages, target_sizes = 
data - input_sizes = input_percentages.mul_(int(inputs.size(3))).int() - # measure data loading time - data_time.update(time.time() - end) - inputs = inputs.to(device) - - out, output_sizes = model(inputs, input_sizes) - out = out.transpose(0, 1) # TxNxH - - float_out = out.float() # ensure float32 for loss - loss = criterion(float_out, targets, output_sizes, target_sizes).to(device) - loss = loss / inputs.size(0) # average the loss by minibatch - - if distributed: - loss = loss.to(device) - loss_value = reduce_tensor(loss, world_size).item() - else: - loss_value = loss.item() - - # Check to ensure valid loss was calculated - valid_loss, error = check_loss(loss, loss_value) - if valid_loss: - optimizer.zero_grad() - # compute gradient - if mixed_precision: - optimizer.backward(loss) - optimizer.clip_master_grads(max_norm) - else: - loss.backward() - torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) - optimizer.step() - else: - print(error) - print('Skipping grad update') - loss_value = 0 - - avg_loss += loss_value - losses.update(loss_value, inputs.size(0)) - - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - if not silent: - print('Epoch: [{0}][{1}/{2}]\t' - 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' - 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' - 'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format( - (epoch + 1), (i + 1), len(train_sampler), batch_time=batch_time, data_time=data_time, - loss=losses)) - if checkpoint_per_batch > 0 and i > 0 and (i + 1) % checkpoint_per_batch == 0 and main_proc: - file_path = '%s/deepspeech_checkpoint_epoch_%d_iter_%d.pth' % (save_folder, epoch + 1, i + 1) - print("Saving checkpoint model to %s" % file_path) - torch.save(self.model.serialize(model, optimizer=optimizer, epoch=epoch, iteration=i, - loss_results=loss_results, - wer_results=wer_results, cer_results=cer_results, - avg_loss=avg_loss), - file_path) - del loss, out, float_out - - avg_loss /= len(train_sampler) - - epoch_time = time.time() - start_epoch_time - print(f"Elapsed time from start: {datetime.now() - sttime}") - print('Training Summary Epoch: [{0}]\t' - 'Time taken (s): {epoch_time:.0f}\t' - 'Average Loss {loss:.3f}\t'.format(epoch + 1, epoch_time=epoch_time, loss=avg_loss)) - - start_iter = 0 # Reset start iteration for next epoch - total_cer, total_wer = 0, 0 - model.eval() - with torch.no_grad(): - for i, (data) in tqdm(enumerate(test_loader), total=len(test_loader)): - inputs, targets, input_percentages, target_sizes = data - input_sizes = input_percentages.mul_(int(inputs.size(3))).int() - inputs = inputs.to(device) - - # unflatten targets - split_targets = [] - offset = 0 - for size in target_sizes: - split_targets.append(targets[offset:offset + size]) - offset += size - - out, output_sizes = model(inputs, input_sizes) - - decoded_output, _ = decoder.decode(out, output_sizes) - target_strings = decoder.convert_to_strings(split_targets) - wer, cer = 0, 0 - for x in range(len(target_strings)): - transcript, reference = decoded_output[x][0], target_strings[x][0] - wer += decoder.wer(transcript, reference) / float(len(reference.split())) - cer += decoder.cer(transcript, reference) / float(len(reference)) - total_cer += cer - total_wer += wer - del out - wer = total_wer / len(test_loader.dataset) - cer = total_cer / len(test_loader.dataset) - wer *= 100 - cer *= 100 - loss_results[epoch] = avg_loss - wer_results[epoch] = wer - cer_results[epoch] = cer - print('Validation Summary Epoch: [{0}]\t' - 'Average WER {wer:.3f}\t' - 
'Average CER {cer:.3f}\t'.format( - epoch + 1, wer=wer, cer=cer)) - - values = { - 'loss_results': loss_results, - 'cer_results': cer_results, - 'wer_results': wer_results - } - - if main_proc and checkpoint: - file_path = '%s/deepspeech_%d.pth.tar' % (save_folder, epoch + 1) - torch.save(self.model.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results, - wer_results=wer_results, cer_results=cer_results), - file_path) - # anneal lr - param_groups = optimizer.optimizer.param_groups if mixed_precision else optimizer.param_groups - for g in param_groups: - g['lr'] = g['lr'] / learning_anneal - print('Learning rate annealed to: {lr:.6f}'.format(lr=g['lr'])) - - if main_proc and (best_wer is None or best_wer > wer): - print("Found better validated model, saving to %s" % model_path) - torch.save(self.model.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results, - wer_results=wer_results, cer_results=cer_results), model_path) - best_wer = wer - - avg_loss = 0 - if not no_shuffle: - print("Shuffling batches...") - train_sampler.shuffle(epoch) - - def validate(self): - pass - - def test(self): - torch.set_grad_enabled(False) - device = torch.device("cuda" if cuda else "cpu") - model = load_model(device, model_path, cuda) - - if decoder == "beam": - from decoder import BeamCTCDecoder - - decoder = BeamCTCDecoder(model.labels, lm_path=lm_path, alpha=alpha, beta=beta, - cutoff_top_n=cutoff_top_n, cutoff_prob=cutoff_prob, - beam_width=beam_width, num_processes=lm_workers) - elif decoder == "greedy": - decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_')) - else: - decoder = None - target_decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_')) - test_dataset = AudioDataset(audio_conf=model.audio_conf, manifest_filepath=test_manifest, - labels=model.labels, normalize=True) - test_loader = AudioDataLoader(test_dataset, batch_size=batch_size, - num_workers=num_workers) - total_cer, total_wer, num_tokens, num_chars = 0, 0, 0, 0 - output_data = [] - for i, (data) in tqdm(enumerate(test_loader), total=len(test_loader)): - inputs, targets, input_percentages, target_sizes = data - input_sizes = input_percentages.mul_(int(inputs.size(3))).int() - inputs = inputs.to(device) - # unflatten targets - split_targets = [] - offset = 0 - for size in target_sizes: - split_targets.append(targets[offset:offset + size]) - offset += size - - out, output_sizes = model(inputs, input_sizes) - - if save_output: - # add output to data array, and continue - output_data.append((out.cpu().numpy(), output_sizes.numpy())) - - decoded_output, _ = decoder.decode(out, output_sizes) - target_strings = target_decoder.convert_to_strings(split_targets) - for x in range(len(target_strings)): - transcript, reference = decoded_output[x][0], target_strings[x][0] - wer_inst = decoder.wer(transcript, reference) - cer_inst = decoder.cer(transcript, reference) - total_wer += wer_inst - total_cer += cer_inst - num_tokens += len(reference.split()) - num_chars += len(reference) - if verbose: - print("Ref:", reference.lower()) - print("Hyp:", transcript.lower()) - print("WER:", float(wer_inst) / len(reference.split()), "CER:", float(cer_inst) / len(reference), - "\n") - - wer = float(total_wer) / num_tokens - cer = float(total_cer) / num_chars - - print('Test Summary \t' - 'Average WER {wer:.3f}\t' - 'Average CER {cer:.3f}\t'.format(wer=wer * 100, cer=cer * 100)) - if save_output: - np.save(output_path, output_data) - - def infer(self, sound): - pass - - @staticmethod - 
def get_default_path(def_path: str) -> str:
-        """
-        Returns the path to the latest checkpoint in the default location
-        :param def_path: default path where checkpoints are stored
-        :return: the path to the latest checkpoint
-        """
-        latest_subdir = max([os.path.join(def_path, d) for d in os.listdir(def_path)], key=os.path.getmtime)
-        default = latest_subdir + "/final.pth"
-        return default
-
-    def print_training_info(self, epoch, loss, cer, wer):
-        print(f"\nTraining Information\n " + \
-              f"- Epoch:\t{epoch}\n " + \
-              f"- Current Loss:\t{loss}\n " + \
-              f"- Current CER: \t{cer}\n" + \
-              f"- Current WER: \t{wer}")
diff --git a/test.py b/test.py
deleted file mode 100644
index f4d8a39..0000000
--- a/test.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import argparse
-from typing import Dict
-
-import yaml
-
-from modelwrapper import ModelWrapper
-
-parser = argparse.ArgumentParser(description='ASR testing')
-parser.add_argument('--config', metavar='DIR',
-                    help='Path to test config file', default='config/test.yaml')
-
-if __name__ == '__main__':
-    args = parser.parse_args()
-    with open(args.config, 'r') as file:
-        config = yaml.safe_load(file)
-    config_dict: Dict = config["test"]
-    model = ModelWrapper(**config_dict)
-    model.test()
diff --git a/train.py b/train.py
deleted file mode 100644
index a763a47..0000000
--- a/train.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import argparse
-from typing import Dict
-
-import yaml
-
-from modelwrapper import ModelWrapper
-
-parser = argparse.ArgumentParser(description='ASR training')
-parser.add_argument('--config', metavar='DIR',
-                    help='Path to train config file', default='config/train.yaml')
-
-if __name__ == '__main__':
-    args = parser.parse_args()
-    with open(args.config, 'r') as file:
-        config = yaml.safe_load(file)
-    config_dict: Dict = config["train"]
-    model = ModelWrapper(**config_dict)
-    model.train()
diff --git a/utils.py b/utils.py
deleted file mode 100644
index 5e219d1..0000000
--- a/utils.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import torch
-from apex.fp16_utils import BN_convert_float
-import torch.distributed as dist
-
-from models.deepspeech2 import DeepSpeech2
-
-
-def convert_model_to_half(model):
-    """
-    Converts model to half but keeps the batch norm layers in 32 bit for precision purposes
-    """
-    old_model = model
-    new_model = BN_convert_float(model.half())
-    del old_model  # Delete previous non-half model
-    return new_model
-
-
-def reduce_tensor(tensor, world_size, reduce_op_max=False):
-    rt = tensor.clone()
-    dist.all_reduce(rt, op=dist.reduce_op.MAX if reduce_op_max is True else dist.reduce_op.SUM)  # Default to sum
-    if not reduce_op_max:
-        rt /= world_size
-    return rt
-
-
-def check_loss(loss, loss_value):
-    """
-    Check that warp-ctc loss is valid and will not break training
-    :return: Return if loss is valid, and the error in case it is not
-    """
-    loss_valid = True
-    error = ''
-    if loss_value == float("inf") or loss_value == float("-inf"):
-        loss_valid = False
-        error = "WARNING: received an inf loss"
-    elif torch.isnan(loss).sum() > 0:
-        loss_valid = False
-        error = 'WARNING: received a nan loss, setting loss value to 0'
-    elif loss_value < 0:
-        loss_valid = False
-        error = "WARNING: received a negative loss"
-    return loss_valid, error
-
-
-def load_model(device, model_path, is_cuda):
-    model = DeepSpeech2.load_model(model_path)
-    model.eval()
-    model = model.to(device)
-    # older checkpoints may not carry the mixed_precision flag
-    if is_cuda and getattr(model, "mixed_precision", False):
-        model = convert_model_to_half(model)
-    return model

From 70d4fc3396edbfd901a64fc65487db555a377a19 Mon Sep 17 00:00:00 2001
From: yuriyarabskyy
Date: Sat, 25 May 2019 12:31:22 +0200
Subject: [PATCH 03/58] restructure code

---
 requirements.txt                   |  24 ++
 setup.py                           |   9 +
 sonosco/STT_srv.py                 |  43 +++
 sonosco/config/infer.yaml          |  21 ++
 sonosco/config/test.yaml           |   9 +
 sonosco/config/train.yaml          |  49 ++++
 sonosco/decoders/__init__.py       |   0
 sonosco/decoders/beam_decoder.py   |  75 ++++++
 sonosco/decoders/decoder.py        |  85 ++++++
 sonosco/decoders/greedy_decoder.py |  74 ++++++
 sonosco/infer.py                   |  24 ++
 sonosco/loader.py                  | 248 +++++++++++++++++
 sonosco/models/__init__.py         |   0
 sonosco/models/deepspeech2.py      | 269 +++++++++++++++++++
 sonosco/modelwrapper.py            | 413 +++++++++++++++++++++++++++++
 sonosco/test.py                    |  18 ++
 sonosco/train.py                   |  18 ++
 sonosco/utils.py                   |  51 ++++
 18 files changed, 1430 insertions(+)
 create mode 100644 requirements.txt
 create mode 100644 setup.py
 create mode 100644 sonosco/STT_srv.py
 create mode 100644 sonosco/config/infer.yaml
 create mode 100644 sonosco/config/test.yaml
 create mode 100644 sonosco/config/train.yaml
 create mode 100644 sonosco/decoders/__init__.py
 create mode 100644 sonosco/decoders/beam_decoder.py
 create mode 100644 sonosco/decoders/decoder.py
 create mode 100644 sonosco/decoders/greedy_decoder.py
 create mode 100644 sonosco/infer.py
 create mode 100644 sonosco/loader.py
 create mode 100644 sonosco/models/__init__.py
 create mode 100644 sonosco/models/deepspeech2.py
 create mode 100644 sonosco/modelwrapper.py
 create mode 100644 sonosco/test.py
 create mode 100644 sonosco/train.py
 create mode 100644 sonosco/utils.py

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c7d87ee
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,24 @@
+apex==0.1
+audioread==2.1.7
+cycler==0.10.0
+decorator==4.4.0
+joblib==0.13.2
+kiwisolver==1.1.0
+librosa==0.6.3
+llvmlite==0.28.0
+matplotlib==3.1.0
+numba==0.43.1
+numpy==1.16.3
+Pillow==6.0.0
+pyparsing==2.4.0
+python-dateutil==2.8.0
+resampy==0.2.1
+scikit-learn==0.21.2
+scipy==1.3.0
+six==1.12.0
+torch==1.1.0
+torchaudio==0.2
+torchvision==0.3.0
+tqdm==4.32.1
+pyyaml==5.1
+-e git+git@github.com:SeanNaren/warp-ctc.git@ab27454d0cf3f936a6e19165682fe2b759f3f8e5#egg=warpctc_pytorch&subdirectory=pytorch_binding
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..bdb1387
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,9 @@
+from setuptools import setup, find_packages
+
+setup(
+    name="sonosco",
+    description="Framework for training automatic speech recognition systems.",
+    author="The Roboy Gang",
+    packages=find_packages(),  # picks up sonosco and its subpackages (decoders, models, ...)
+    include_package_data=True,
+)
diff --git a/sonosco/STT_srv.py b/sonosco/STT_srv.py
new file mode 100644
index 0000000..7013756
--- /dev/null
+++ b/sonosco/STT_srv.py
@@ -0,0 +1,43 @@
+import os
+
+from roboy_cognition_msgs.msg import RecognizedSpeech
+from roboy_cognition_msgs.srv import RecognizeSpeech
+
+from asr_interface import IAsr
+import rclpy
+from rclpy.node import Node
+
+
+class SonoscoROS2(Node):
+    def __init__(self):
+        super().__init__('stt')
+        self.publisher = self.create_publisher(RecognizedSpeech, '/roboy/cognition/speech/recognition')
+        self.srv = self.create_service(RecognizeSpeech, '/roboy/cognition/speech/recognition/recognize', self.asr_callback)
+        print("Ready to serve /roboy/cognition/speech/recognition/recognize")
+        print(f"Roboy Sonosco running with PID: {os.getpid()}")
+        self.i = IAsr()
+        print("Status: Speech recognition is ready now!")
+        print("Roboy Sonosco is ready!")
+
+    def asr_callback(self, request, response):
+        response.success = True
+        self.get_logger().info('Incoming Audio')
+        msg = RecognizedSpeech()
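+        # NOTE: the transcript produced by inference_audio() below is never copied
+        # into `msg`, so an empty RecognizedSpeech message is published; presumably
+        # the recognized text should first be assigned to the message fields.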
self.i.inference_audio(request)
+        self.publisher.publish(msg)
+        return response
+
+
+def main(args=None):
+    rclpy.init(args=args)
+
+    stt = SonoscoROS2()
+
+    while rclpy.ok():
+        rclpy.spin_once(stt)
+
+    rclpy.shutdown()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/sonosco/config/infer.yaml b/sonosco/config/infer.yaml
new file mode 100644
index 0000000..be61617
--- /dev/null
+++ b/sonosco/config/infer.yaml
@@ -0,0 +1,21 @@
+infer:
+  model_name: ""
+  audio_path: ""        # Audio file to predict on
+
+  sample_rate: 16000    # Sample rate
+  window_size: 0.02     # Window size for spectrogram in seconds
+  window_stride: 0.01   # Window stride for spectrogram in seconds
+  window: 'hamming'     # Window type for spectrogram generation
+
+  beam_decoder: False   # Use the beam decoder; otherwise greedy decoding is used
+  alpha: 0.8
+  beam_width: 10
+  beta: 1
+  cutoff_prob: 1.0
+  cutoff_top_n: 40
+  lm_path: null         # Path to a KenLM binary (null disables the LM)
+  lm_workers: 1
+  offsets: False        # Returns time offset information
+  top_paths: 1
+
+  cuda: True            # Use cuda to run model
\ No newline at end of file
diff --git a/sonosco/config/test.yaml b/sonosco/config/test.yaml
new file mode 100644
index 0000000..2589e15
--- /dev/null
+++ b/sonosco/config/test.yaml
@@ -0,0 +1,9 @@
+test:
+  test_manifest: ""     # Path to test manifest csv
+
+  batch_size: 32        # Batch size for testing
+  num_workers: 4        # Number of workers used in loading
+  verbose: True         # Print out decoded output and error of each sample
+  save_output: True     # Saves output of model from test
+  output_path: ""       # Where to save raw acoustic output
+
diff --git a/sonosco/config/train.yaml b/sonosco/config/train.yaml
new file mode 100644
index 0000000..25be72c
--- /dev/null
+++ b/sonosco/config/train.yaml
@@ -0,0 +1,49 @@
+train:
+  train_manifest: 'examples/manifests/train_manifest.csv'
+  val_manifest: 'examples/manifests/val_manifest.csv'
+  labels_path: 'examples/labels.json'  # Contains all characters for transcription
+  log_dir: 'logs'                      # Location for log files
+  def_dir: 'examples/checkpoints/'     # Default location to save/load models
+
+  load_from: 'asr_final.pth'  # File name containing a checkpoint to continue/finetune
+
+  sample_rate: 16000    # Sample rate
+  window_size: 0.02     # Window size for spectrogram in seconds
+  window_stride: 0.01   # Window stride for spectrogram in seconds
+  window: 'hamming'     # Window type for spectrogram generation
+
+  batch_size: 32        # Batch size for training
+  hidden_size: 800      # Hidden size of RNNs
+  hidden_layers: 5      # Number of RNN layers
+  rnn_type: 'gru'       # Type of the RNN unit: gru|lstm are supported
+
+  max_epochs: 70        # Number of training epochs
+  learning_rate: 3e-4   # Initial learning rate
+  momentum: 0.9         # Momentum
+  max_norm: 800         # Norm cutoff to prevent explosion of gradients
+  learning_anneal: 1.1  # Annealing applied to learning rate every epoch
+  sortaGrad: True       # Turn on ordering of dataset on sequence length for the first epoch
+
+  checkpoint: True          # Enables checkpoint saving of model
+  checkpoint_per_epoch: 1   # Save checkpoint per x epochs
+  silent: False             # Suppress per-iteration progress output
+  verbose: False            # Turn on verbose progress tracking
+  continue: False           # Continue training with a pre-trained model
+  finetune: False           # Finetune a pre-trained model
+
+  num_data_workers: 8   # Number of workers used in data-loading
+  augment: False        # Use random tempo and gain perturbations
+  shuffle: True         # Turn on shuffling and sample from dataset based on sequence length (smallest to largest)
+
+  seed: 123456          # Seed to generators
+  cuda: True            # Use cuda to train model
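Before the precision and distributed flags below, a note on how these config sections are consumed: the entry scripts added later in this patch parse the YAML into a dict and splat it into ModelWrapper, which is why each value's type matters (a misspelled boolean would silently arrive as a truthy string). A minimal sketch of that flow, using yaml.safe_load to sidestep the loader warning of the pinned PyYAML 5.1:

    import yaml

    with open("config/train.yaml") as f:
        config = yaml.safe_load(f)  # safe_load avoids PyYAML 5.1's yaml.load() warning

    train_conf = config["train"]
    assert isinstance(train_conf["cuda"], bool)  # True/False must parse as booleans, not strings
    # ModelWrapper(**train_conf): keys like 'continue' only work because they
    # arrive via **kwargs; 'continue' is a reserved word in Python otherwise.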
+  half_precision: True         # Uses half precision to train a model
+  apex: True                   # Uses mixed precision to train a model
+  static_loss_scaling: False   # Static loss scale for mixed precision
+  dynamic_loss_scaling: True   # Use dynamic loss scaling for mixed precision
+
+  dist_url: 'tcp://127.0.0.1:1550'  # URL used to set up distributed training
+  dist_backend: 'nccl'              # Distributed backend
+  world_size: 1                     # Number of distributed processes
+  rank: 0                           # The rank of the current process
+  gpu_rank: 0                       # If using distributed parallel for multi_gpu, sets the GPU for the process
\ No newline at end of file
diff --git a/sonosco/decoders/__init__.py b/sonosco/decoders/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/sonosco/decoders/beam_decoder.py b/sonosco/decoders/beam_decoder.py
new file mode 100644
index 0000000..c44d164
--- /dev/null
+++ b/sonosco/decoders/beam_decoder.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+# ----------------------------------------------------------------------------
+# Copyright 2015-2016 Nervana Systems Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+# Modified to support pytorch Tensors
+import torch
+
+from decoders.decoder import Decoder
+
+
+class BeamCTCDecoder(Decoder):
+    def __init__(self, labels, lm_path=None, alpha=0, beta=0, cutoff_top_n=40, cutoff_prob=1.0, beam_width=100,
+                 num_processes=4, blank_index=0):
+        super(BeamCTCDecoder, self).__init__(labels)
+        try:
+            from ctcdecode import CTCBeamDecoder
+        except ImportError:
+            raise ImportError("BeamCTCDecoder requires the ctcdecode package.")
+        self._decoder = CTCBeamDecoder(labels, lm_path, alpha, beta, cutoff_top_n, cutoff_prob, beam_width,
+                                       num_processes, blank_index)
+
+    def convert_to_strings(self, out, seq_len):
+        results = []
+        for b, batch in enumerate(out):
+            utterances = []
+            for p, utt in enumerate(batch):
+                size = seq_len[b][p]
+                if size > 0:
+                    transcript = ''.join(map(lambda x: self.int_to_char[x.item()], utt[0:size]))
+                else:
+                    transcript = ''
+                utterances.append(transcript)
+            results.append(utterances)
+        return results
+
+    def convert_tensor(self, offsets, sizes):
+        results = []
+        for b, batch in enumerate(offsets):
+            utterances = []
+            for p, utt in enumerate(batch):
+                size = sizes[b][p]
+                if sizes[b][p] > 0:
+                    utterances.append(utt[0:size])
+                else:
+                    utterances.append(torch.tensor([], dtype=torch.int))
+            results.append(utterances)
+        return results
+
+    def decode(self, probs, sizes=None):
+        """
+        Decodes probability output using the ctcdecode package.
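For orientation, a minimal usage sketch of this decoder, assuming the optional ctcdecode dependency is installed, the script runs from inside the sonosco package so the decoders import resolves, and the label string starts with the CTC blank:

    import torch

    from decoders.beam_decoder import BeamCTCDecoder

    labels = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ "
    decoder = BeamCTCDecoder(labels, beam_width=10, blank_index=0)

    # Fake model output: batch x time x classes probabilities plus per-utterance lengths.
    probs = torch.softmax(torch.randn(2, 50, len(labels)), dim=-1)
    sizes = torch.tensor([50, 42], dtype=torch.int)
    transcripts, offsets = decoder.decode(probs, sizes)
    print(transcripts[0][0])  # best beam for the first utterance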
+ Arguments: + probs: Tensor of character probabilities, where probs[c,t] + is the probability of character c at time t + sizes: Size of each sequence in the mini-batch + Returns: + string: sequences of the model's best guess for the transcription + """ + probs = probs.cpu() + out, scores, offsets, seq_lens = self._decoder.decode(probs, sizes) + + strings = self.convert_to_strings(out, seq_lens) + offsets = self.convert_tensor(offsets, seq_lens) + return strings, offsets \ No newline at end of file diff --git a/sonosco/decoders/decoder.py b/sonosco/decoders/decoder.py new file mode 100644 index 0000000..99a8193 --- /dev/null +++ b/sonosco/decoders/decoder.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python +# ---------------------------------------------------------------------------- +# Copyright 2015-2016 Nervana Systems Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- +# Modified to support pytorch Tensors + +import Levenshtein as Lev + + +class Decoder(object): + """ + Basic decoder class from which all other decoders inherit. Implements several + helper functions. Subclasses should implement the decode() method. + + Arguments: + labels (string): mapping from integers to characters. + blank_index (int, optional): index for the blank '_' character. Defaults to 0. + space_index (int, optional): index for the space ' ' character. Defaults to 28. + """ + + def __init__(self, labels, blank_index=0): + # e.g. labels = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ#" + self.labels = labels + self.int_to_char = dict([(i, c) for (i, c) in enumerate(labels)]) + self.blank_index = blank_index + space_index = len(labels) # To prevent errors in decode, we add an out of bounds index for the space + if ' ' in labels: + space_index = labels.index(' ') + self.space_index = space_index + + def wer(self, s1, s2): + """ + Computes the Word Error Rate, defined as the edit distance between the + two provided sentences after tokenizing to words. + Arguments: + s1 (string): space-separated sentence + s2 (string): space-separated sentence + """ + + # build mapping of words to integers + b = set(s1.split() + s2.split()) + word2char = dict(zip(b, range(len(b)))) + + # map the words to a char array (Levenshtein packages only accepts + # strings) + w1 = [chr(word2char[w]) for w in s1.split()] + w2 = [chr(word2char[w]) for w in s2.split()] + + return Lev.distance(''.join(w1), ''.join(w2)) + + def cer(self, s1, s2): + """ + Computes the Character Error Rate, defined as the edit distance. 
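The wer method above relies on a compact trick: every distinct word is mapped to a single character, so Levenshtein.distance, which only operates on strings, effectively computes a word-level edit distance. A small worked example:

    import Levenshtein as Lev

    s1, s2 = "the cat sat", "the cat sat down"
    vocab = set(s1.split() + s2.split())
    word2char = dict(zip(vocab, range(len(vocab))))

    # Encode each sentence as one character per word, then compare.
    w1 = ''.join(chr(word2char[w]) for w in s1.split())
    w2 = ''.join(chr(word2char[w]) for w in s2.split())
    print(Lev.distance(w1, w2))  # 1 (a single inserted word)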
+ + Arguments: + s1 (string): space-separated sentence + s2 (string): space-separated sentence + """ + s1, s2, = s1.replace(' ', ''), s2.replace(' ', '') + return Lev.distance(s1, s2) + + def decode(self, probs, sizes=None): + """ + Given a matrix of character probabilities, returns the decoder's + best guess of the transcription + + Arguments: + probs: Tensor of character probabilities, where probs[c,t] + is the probability of character c at time t + sizes(optional): Size of each sequence in the mini-batch + Returns: + string: sequence of the model's best guess for the transcription + """ + raise NotImplementedError diff --git a/sonosco/decoders/greedy_decoder.py b/sonosco/decoders/greedy_decoder.py new file mode 100644 index 0000000..c14884f --- /dev/null +++ b/sonosco/decoders/greedy_decoder.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python +# ---------------------------------------------------------------------------- +# Copyright 2015-2016 Nervana Systems Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- +# Modified to support pytorch Tensors + +import torch + +from decoders.decoder import Decoder + + +class GreedyDecoder(Decoder): + def __init__(self, labels, blank_index=0): + super(GreedyDecoder, self).__init__(labels, blank_index) + + def convert_to_strings(self, sequences, sizes=None, remove_repetitions=False, return_offsets=False): + """Given a list of numeric sequences, returns the corresponding strings""" + strings = [] + offsets = [] if return_offsets else None + for x in range(len(sequences)): + seq_len = sizes[x] if sizes is not None else len(sequences[x]) + string, string_offsets = self.process_string(sequences[x], seq_len, remove_repetitions) + strings.append([string]) # We only return one path + if return_offsets: + offsets.append([string_offsets]) + if return_offsets: + return strings, offsets + else: + return strings + + def process_string(self, sequence, size, remove_repetitions=False): + string = '' + offsets = [] + for i in range(size): + char = self.int_to_char[sequence[i].item()] + if char != self.int_to_char[self.blank_index]: + # if this char is a repetition and remove_repetitions=true, then skip + if remove_repetitions and i != 0 and char == self.int_to_char[sequence[i - 1].item()]: + pass + elif char == self.labels[self.space_index]: + string += ' ' + offsets.append(i) + else: + string = string + char + offsets.append(i) + return string, torch.tensor(offsets, dtype=torch.int) + + def decode(self, probs, sizes=None): + """ + Returns the argmax decoding given the probability matrix. Removes + repeated elements in the sequence, as well as blanks. + + Arguments: + probs: Tensor of character probabilities from the network. 
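The rule this class implements is standard CTC best-path decoding: take the argmax label per frame, merge consecutive repeats, then drop blanks. In miniature:

    labels = "_ab"             # index 0 is the CTC blank
    frames = [1, 1, 0, 2, 2]   # per-frame argmax, i.e. "aa_bb"

    decoded, prev = [], None
    for idx in frames:
        if idx != prev and idx != 0:  # merge repeats, skip blanks
            decoded.append(labels[idx])
        prev = idx
    print(''.join(decoded))  # prints "ab"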
Expected shape of batch x seq_length x output_dim + sizes(optional): Size of each sequence in the mini-batch + Returns: + strings: sequences of the model's best guess for the transcription on inputs + offsets: time step per character predicted + """ + _, max_probs = torch.max(probs, 2) + strings, offsets = self.convert_to_strings(max_probs.view(max_probs.size(0), max_probs.size(1)), sizes, + remove_repetitions=True, return_offsets=True) + return strings, offsets diff --git a/sonosco/infer.py b/sonosco/infer.py new file mode 100644 index 0000000..8fe03d2 --- /dev/null +++ b/sonosco/infer.py @@ -0,0 +1,24 @@ +import argparse +import os +import wave +from typing import Dict + +import yaml + +from modelwrapper import ModelWrapper + +parser = argparse.ArgumentParser(description='ASR inference') +parser.add_argument('--config', metavar='DIR', + help='Path to inference config file', default='config/infer.yaml') + +if __name__ == '__main__': + args = parser.parse_args() + with open(args.config, 'r') as file: + config = yaml.load(file) + config_dict: Dict = config["infer"] + model = ModelWrapper(**config_dict) + if "wave_path" in config_dict.keys() and os.path.isfile(config_dict.get("wave_path")): + sound = wave.open(config_dict.get("wave_path")) + print(model.infer(sound)) + else: + print("Wave file not found!") diff --git a/sonosco/loader.py b/sonosco/loader.py new file mode 100644 index 0000000..00b06d1 --- /dev/null +++ b/sonosco/loader.py @@ -0,0 +1,248 @@ +# ---------------------------------------------------------------------------- +# Based on SeanNaren's deepspeech.pytorch: +# https://github.com/SeanNaren/deepspeech.pytorch +# ---------------------------------------------------------------------------- + +import math +import warnings +from typing import Tuple + +import librosa +import numpy as np +import torch +import torchaudio +from scipy import signal +from torch.utils.data import Dataset, DataLoader, Sampler +from torch.distributed import get_rank +from torch.distributed import get_world_size + +windows = {"bartlett": torch.bartlett_window, + "blackman": torch.blackman_window, + "hamming": torch.hamming_window, + "hann": torch.hann_window} + +windows_legacy = {'hamming': signal.hamming, + 'hann': signal.hann, + 'blackman': signal.blackman, + 'bartlett': signal.bartlett} + + +class DataProcessor(object): + def __init__(self, audio_conf, labels="abc", normalize=False, augment=False, legacy=True): + """ + Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by + a comma. Each new line is a different sample. Example below: + + /path/to/audio.wav,/path/to/audio.txt + ... 
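The manifest referenced here is a plain two-column CSV with no header. A sketch for generating one from paired wav/txt files; the directory layout is illustrative only:

    import csv
    from pathlib import Path

    root = Path("data/librispeech")  # illustrative location
    with open("train_manifest.csv", "w", newline="") as f:
        writer = csv.writer(f)
        for wav in sorted(root.rglob("*.wav")):
            txt = wav.with_suffix(".txt")
            if txt.exists():  # only keep audio that has a transcript
                writer.writerow([str(wav), str(txt)])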
+ + :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds + :param labels: String containing all the possible characters to map to + :param normalize: Apply standard mean and deviation normalization to audio tensor + :param augment(default False): Apply random tempo and gain perturbations + """ + self.labels_map = dict([(labels[i], i) for i in range(len(labels))]) + self.window_stride = audio_conf["window_stride"] + self.window_size = audio_conf["window_size"] + self.sample_rate = audio_conf["sample_rate"] + self.window = windows_legacy.get(audio_conf["window"], windows_legacy["hamming"]) if legacy else windows.get(audio_conf["window"], windows["hamming"]) + self.normalize = normalize + self.augment = augment + self.legacy = legacy + self.transform = torchaudio.transforms.Spectrogram(n_fft=int(self.sample_rate * self.window_size), + hop=int(self.sample_rate * self.window_stride), + window=self.window, normalize=self.normalize) + + @staticmethod + def retrieve_file(audio_path, legacy=True): + sound, sample_rate = torchaudio.load(audio_path) + if legacy: + sound = sound.numpy().T + if len(sound.shape) > 1: + if sound.shape[1] == 1: + sound = sound.squeeze() + else: + sound = sound.mean(axis=1) + return sound, sample_rate + + @staticmethod + def augment_audio(sound, tempo_range: Tuple = (0.85, 1.15), gain_range: Tuple = (-6, 8)): + """ + Changes tempo and gain of the wave + """ + warnings.warn("Augmentation is not implemented") # TODO: Implement + return sound + + def parse_audio(self, audio_path): + sound, sample_rate = self.retrieve_file(audio_path, self.legacy) + if sample_rate != self.sample_rate: + raise ValueError(f"The stated sample rate {self.sample_rate} and the factual rate {sample_rate} differ!") + + if self.augment: + sound = self.augment_audio(sound) + + if self.legacy: + n_fft = int(self.sample_rate * self.window_size) + win_length = n_fft + hop_length = int(self.sample_rate * self.window_stride) + # STFT + D = librosa.stft(sound, n_fft=n_fft, hop_length=hop_length, + win_length=win_length, window=self.window) + spectrogram, phase = librosa.magphase(D) + # S = log(S+1) + + spectrogram = torch.FloatTensor(np.log1p(spectrogram)) + else: + # TODO: Why these are different from librosa.stft? + sound = sound.cuda() + spectrogram = self.transform(sound)[-1, :, :].transpose(0, 1) + + # spectrogram = torch.stft(torch.from_numpy(sound.numpy().T.squeeze()), + # n_fft=int(self.sample_rate * self.window_size), + # hop_length=int(self.sample_rate * self.window_stride), + # win_length=int(self.sample_rate * self.window_size), + # window=torch.hamming_window(int(self.sample_rate * self.window_size)))[:, :, -1] + + if self.normalize: + mean = spectrogram.mean() + std = spectrogram.std() + spectrogram.add_(-mean) + spectrogram.div_(std) + + return spectrogram + + def parse_transcript(self, transcript_path): + with open(transcript_path, 'r', encoding='utf8') as transcript_file: + transcript = transcript_file.read().replace('\n', '') + # TODO: Is it fast enough? + transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)])) + return transcript + + +class AudioDataset(Dataset): + def __init__(self, audio_conf, manifest_filepath, labels, normalize=False, augment=False, legacy=True): + """ + Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by + a comma. Each new line is a different sample. Example below: + + /path/to/audio.wav,/path/to/audio.txt + ... 
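A quick sanity check on the shapes parse_audio produces in the legacy librosa path: with 16 kHz audio and a 20 ms window, n_fft is 320, so the log-magnitude spectrogram has n_fft // 2 + 1 = 161 frequency bins and roughly one frame per 10 ms stride, which is exactly the rnn_in_size arithmetic the model below starts from:

    sample_rate, window_size, window_stride = 16000, 0.02, 0.01

    n_fft = int(sample_rate * window_size)   # 320
    hop = int(sample_rate * window_stride)   # 160
    freq_bins = n_fft // 2 + 1               # 161 spectrogram rows
    frames_per_second = sample_rate // hop   # ~100 columns per second of audio
    print(freq_bins, frames_per_second)      # 161 100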
+ + :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds + :param manifest_filepath: Path to manifest csv as describe above + :param labels: String containing all the possible characters to map to + :param normalize: Apply standard mean and deviation normalization to audio tensor + :param augment(default False): Apply random tempo and gain perturbations + """ + super(AudioDataset, self).__init__() + with open(manifest_filepath) as f: + ids = f.readlines() + ids = [x.strip().split(',') for x in ids] + self.ids = ids + self.size = len(ids) + self.processor = DataProcessor(audio_conf, labels, normalize, augment, legacy) + + def __getitem__(self, index): + sample = self.ids[index] + audio_path, transcript_path = sample[0], sample[1] + + spectrogram = self.processor.parse_audio(audio_path) + transcript = self.processor.parse_transcript(transcript_path) + + return spectrogram, transcript + + def __len__(self): + return self.size + + +# TODO: Optimise +def _collate_fn(batch): + batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True) + longest_sample = batch[0][0] + freq_size, max_seqlength = longest_sample.size() + minibatch_size = len(batch) + inputs = torch.zeros(minibatch_size, 1, freq_size, max_seqlength) + input_percentages = torch.FloatTensor(minibatch_size) + target_sizes = np.zeros(minibatch_size, dtype=np.int32) + + # TODO: Numpy broadcasting magic + targets = [] + + for x in range(minibatch_size): + inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) + input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) + target_sizes[x] = len(batch[x][1]) + targets.extend(batch[x][1]) + + return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) + + +class AudioDataLoader(DataLoader): + def __init__(self, *args, **kwargs): + """ + Creates a data loader for AudioDatasets. + """ + super(AudioDataLoader, self).__init__(*args, **kwargs) + self.collate_fn = _collate_fn + + +class BucketingSampler(Sampler): + def __init__(self, data_source, batch_size=1): + """ + Samples batches assuming they are in order of size to batch similarly sized samples together. + """ + super(BucketingSampler, self).__init__(data_source) + self.data_source = data_source + ids = list(range(0, len(data_source))) + # TODO: Optimise + self.bins = [ids[i:i + batch_size] for i in range(0, len(ids), batch_size)] + + def __iter__(self): + for ids in self.bins: + np.random.shuffle(ids) + yield ids + + def __len__(self): + return len(self.bins) + + def shuffle(self, epoch): + np.random.shuffle(self.bins) + + +# TODO: Optimise +class DistributedBucketingSampler(Sampler): + def __init__(self, data_source, batch_size=1, num_replicas=None, rank=None): + """ + Samples batches assuming they are in order of size to batch similarly sized samples together. 
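A subtlety in _collate_fn above: rather than storing absolute frame counts, it records each utterance's length as a fraction of the longest sample in the batch; the training loop later recovers integer sizes by multiplying back. The round trip looks like this:

    import torch

    max_seqlength = 200
    lengths = torch.tensor([200., 150., 80.])
    input_percentages = lengths / max_seqlength  # what _collate_fn stores

    # What the training loop does to recover frame counts (see modelwrapper below):
    input_sizes = input_percentages.mul_(max_seqlength).int()
    print(input_sizes.tolist())  # [200, 150, 80]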
+ """ + super(DistributedBucketingSampler, self).__init__(data_source) + if num_replicas is None: + num_replicas = get_world_size() + if rank is None: + rank = get_rank() + self.data_source = data_source + self.ids = list(range(0, len(data_source))) + self.batch_size = batch_size + self.bins = [self.ids[i:i + batch_size] for i in range(0, len(self.ids), batch_size)] + self.num_replicas = num_replicas + self.rank = rank + self.num_samples = int(math.ceil(len(self.bins) * 1.0 / self.num_replicas)) + self.total_size = self.num_samples * self.num_replicas + + def __iter__(self): + offset = self.rank + # add extra samples to make it evenly divisible + bins = self.bins + self.bins[:(self.total_size - len(self.bins))] + assert len(bins) == self.total_size + samples = bins[offset::self.num_replicas] # Get every Nth bin, starting from rank + return iter(samples) + + def __len__(self): + return self.num_samples + + def shuffle(self, epoch): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(epoch) + bin_ids = list(torch.randperm(len(self.bins), generator=g)) + self.bins = [self.bins[i] for i in bin_ids] diff --git a/sonosco/models/__init__.py b/sonosco/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sonosco/models/deepspeech2.py b/sonosco/models/deepspeech2.py new file mode 100644 index 0000000..dcfa100 --- /dev/null +++ b/sonosco/models/deepspeech2.py @@ -0,0 +1,269 @@ +# ---------------------------------------------------------------------------- +# Based on SeanNaren's deepspeech.pytorch: +# https://github.com/SeanNaren/deepspeech.pytorch +# ---------------------------------------------------------------------------- + +import math +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +supported_rnns = { + 'lstm': nn.LSTM, + 'rnn': nn.RNN, + 'gru': nn.GRU +} +supported_rnns_inv = dict((v, k) for k, v in supported_rnns.items()) + + +class SequenceWise(nn.Module): + def __init__(self, module): + """ + Collapses input of dim T*N*H to (T*N)*H, and applies to a module. + Allows handling of variable sequence lengths and minibatch sizes. + :param module: Module to apply input to. + """ + super(SequenceWise, self).__init__() + self.module = module + + def forward(self, x): + t, n = x.size(0), x.size(1) + x = x.view(t * n, -1) + x = self.module(x) + x = x.view(t, n, -1) + return x + + def __repr__(self): + tmpstr = self.__class__.__name__ + ' (\n' + tmpstr += self.module.__repr__() + tmpstr += ')' + return tmpstr + + +class MaskConv(nn.Module): + def __init__(self, seq_module): + """ + Adds padding to the output of the module based on the given lengths. This is to ensure that the + results of the model do not change when batch sizes change during inference. + Input needs to be in the shape of (BxCxDxT) + :param seq_module: The sequential module containing the conv stack. 
+ """ + super(MaskConv, self).__init__() + self.seq_module = seq_module + + def forward(self, x, lengths): + """ + :param x: The input of size BxCxDxT + :param lengths: The actual length of each sequence in the batch + :return: Masked output from the module + """ + for module in self.seq_module: + x = module(x) + mask = torch.ByteTensor(x.size()).fill_(0) + if x.is_cuda: + mask = mask.cuda() + for i, length in enumerate(lengths): + length = length.item() + if (mask[i].size(2) - length) > 0: + mask[i].narrow(2, length, mask[i].size(2) - length).fill_(1) + x = x.masked_fill(mask, 0) + return x, lengths + + +class InferenceBatchSoftmax(nn.Module): + def forward(self, input_): + if not self.training: + return F.softmax(input_, dim=-1) + else: + return input_ + + +class BatchRNN(nn.Module): + def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, batch_norm=True): + super(BatchRNN, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None + self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size, + bidirectional=True, bias=True) + + def flatten_parameters(self): + self.rnn.flatten_parameters() + + def forward(self, x, output_lengths): + if self.batch_norm is not None: + x = self.batch_norm(x) + x = nn.utils.rnn.pack_padded_sequence(x, output_lengths) + x, h = self.rnn(x) + x, _ = nn.utils.rnn.pad_packed_sequence(x) + if self.bidirectional: + x = x.view(x.size(0), x.size(1), 2, -1).sum(2).view(x.size(0), x.size(1), -1) # (TxNxH*2) -> (TxNxH) by sum + return x + + +class DeepSpeech2(nn.Module): + def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hid_size=768, nb_layers=5, audio_conf=None, + bidirectional=True): + super(DeepSpeech2, self).__init__() + + # model metadata needed for serialization/deserialization + if audio_conf is None: + audio_conf = {} + self.version = '0.0.1' + self.hidden_size = rnn_hid_size + self.hidden_layers = nb_layers + self.rnn_type = rnn_type + self.audio_conf = audio_conf or {} + self.labels = labels + self.bidirectional = bidirectional + # self.mixed_precision = mixed_precision + + sample_rate = self.audio_conf.get("sample_rate", 16000) + window_size = self.audio_conf.get("window_size", 0.02) + num_classes = len(self.labels) + + self.conv = MaskConv(nn.Sequential( + nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5)), + nn.BatchNorm2d(32), + nn.Hardtanh(0, 20, inplace=True), + nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5)), + nn.BatchNorm2d(32), + nn.Hardtanh(0, 20, inplace=True) + )) + # Based on above convolutions and spectrogram size using conv formula (W - F + 2P)/ S+1 + rnn_in_size = int(math.floor((sample_rate * window_size) / 2) + 1) + rnn_in_size = int(math.floor(rnn_in_size + 2 * 20 - 41) / 2 + 1) + rnn_in_size = int(math.floor(rnn_in_size + 2 * 10 - 21) / 2 + 1) + rnn_in_size *= 32 + + rnns = [('0', BatchRNN(input_size=rnn_in_size, hidden_size=rnn_hid_size, rnn_type=rnn_type, batch_norm=False))] + rnns.extend([(f"{x + 1}", BatchRNN(input_size=rnn_hid_size, hidden_size=rnn_hid_size, rnn_type=rnn_type)) + for x in range(nb_layers - 1)]) + self.rnns = nn.Sequential(OrderedDict(rnns)) + + fully_connected = nn.Sequential( + nn.BatchNorm1d(rnn_hid_size), + nn.Linear(rnn_hid_size, num_classes, bias=False) + ) + + self.fc = nn.Sequential( + SequenceWise(fully_connected), + ) + + self.inference_softmax = InferenceBatchSoftmax() + + def forward(self, x, lengths): + # if x.is_cuda and 
self.mixed_precision: + # x = x.half() + lengths = lengths.cpu().int() + output_lengths = self.get_seq_lens(lengths) + x, _ = self.conv(x, output_lengths) + + sizes = x.size() + x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3]) # Collapse feature dimension + x = x.transpose(1, 2).transpose(0, 1).contiguous() # TxNxH + + for rnn in self.rnns: + x = rnn(x, output_lengths) + + if not self.bidirectional: # no need for lookahead layer in bidirectional + x = self.lookahead(x) + + x = self.fc(x) + x = x.transpose(0, 1) + # identity in training mode, softmax in eval mode + x = self.inference_softmax(x) + return x, output_lengths + + def get_seq_lens(self, input_length): + """ + Given a 1D Tensor or Variable containing integer sequence lengths, return a 1D tensor or variable + containing the size sequences that will be output by the network. + :param input_length: 1D Tensor + :return: 1D Tensor scaled by model + """ + seq_len = input_length + for m in self.conv.modules(): + if type(m) == nn.modules.conv.Conv2d: + seq_len = ((seq_len + 2 * m.padding[1] - m.dilation[1] * (m.kernel_size[1] - 1) - 1) / m.stride[1] + 1) + return seq_len.int() + + @classmethod + def load_model(cls, path): + package = torch.load(path, map_location=lambda storage, loc: storage) + model = cls(rnn_hidden_size=package['hidden_size'], nb_layers=package['hidden_layers'], + labels=package['labels'], audio_conf=package['audio_conf'], + rnn_type=supported_rnns[package['rnn_type']], bidirectional=package.get('bidirectional', True)) + model.load_state_dict(package['state_dict']) + for x in model.rnns: + x.flatten_parameters() + + return model + + @classmethod + def load_model_package(cls, package): + model = cls(rnn_hidden_size=package['hidden_size'], nb_layers=package['hidden_layers'], + labels=package['labels'], audio_conf=package['audio_conf'], + rnn_type=supported_rnns[package['rnn_type']], bidirectional=package.get('bidirectional', True)) + model.load_state_dict(package['state_dict']) + + return model + + @staticmethod + def serialize(model, optimizer=None, epoch=None, iteration=None, loss_results=None, + cer_results=None, wer_results=None, avg_loss=None, meta=None): + package = { + 'version': model.version, + 'hidden_size': model.hidden_size, + 'hidden_layers': model.hidden_layers, + 'rnn_type': supported_rnns_inv.get(model.rnn_type, model.rnn_type.__name__.lower()), + 'audio_conf': model.audio_conf, + 'labels': model.labels, + 'state_dict': model.state_dict(), + 'bidirectional': model.bidirectional + } + if optimizer is not None: + package['optim_dict'] = optimizer.state_dict() + if avg_loss is not None: + package['avg_loss'] = avg_loss + if epoch is not None: + package['epoch'] = epoch + 1 # increment for readability + if iteration is not None: + package['iteration'] = iteration + if loss_results is not None: + package['loss_results'] = loss_results + package['cer_results'] = cer_results + package['wer_results'] = wer_results + if meta is not None: + package['meta'] = meta + return package + + @staticmethod + def get_param_size(model): + params = 0 + for p in model.parameters(): + tmp = 1 + for x in p.size(): + tmp *= x + params += tmp + return params + + def __repr__(self): + rep = f"DeepSpeech2 version: {self.version}\n" + \ + "=======================================" + \ + "Recurrent Neural Network Properties\n" + \ + f" RNN Type: \t{self.rnn_type.__name__.lower()}\n" + \ + f" RNN Layers:\t{self.hidden_layers}\n" + \ + f" RNN Size: \t{self.hidden_size}\n" + \ + f" Classes: \t{len(self.labels)}\n" + \ + 
"---------------------------------------\n" + \ + "Model Features\n" + \ + f" Labels: \t{self.labels}\n" + \ + f" Sample Rate: \t{self.audio_conf.get('sample_rate', 'n/a')}\n" + \ + f" Window Type: \t{self.audio_conf.get('window', 'n/a')}\n" + \ + f" Window Size: \t{self.audio_conf.get('window_size', 'n/a')}\n" + \ + f" Window Stride:\t{self.audio_conf.get('window_stride', 'n/a')}" + return rep diff --git a/sonosco/modelwrapper.py b/sonosco/modelwrapper.py new file mode 100644 index 0000000..b313603 --- /dev/null +++ b/sonosco/modelwrapper.py @@ -0,0 +1,413 @@ +import os.path +from random import random +from datetime import datetime + +import numpy as np +import torch + +from models.deepspeech2 import DeepSpeech2 + +import json +import os +import random +import time + +import torch.distributed as dist +import torch.utils.data.distributed + +try: + from apex.fp16_utils import FP16_Optimizer + from apex.parallel import DistributedDataParallel +except Exception as e: + print(f"Apex import failed: {e}") + +from tqdm import tqdm +from warpctc_pytorch import CTCLoss + +from loader import AudioDataLoader, AudioDataset, BucketingSampler, DistributedBucketingSampler +from decoders.greedy_decoder import GreedyDecoder +from utils import convert_model_to_half, reduce_tensor, check_loss + +models = {"deepspeech2": DeepSpeech2} + +sttime = datetime.now() +print(f"Time of start: {sttime}") + + +def to_np(x): + return x.cpu().numpy() + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +class ModelWrapper(object): + DEF_PATH = "examples/checkpoints/" + + def __init__(self, **kwargs): + self.model = kwargs.get("model", models["deepspeech2"]) + + if kwargs.get("continue"): + path = kwargs.get("from", ModelWrapper.get_default_path()) + self.model.package = torch.load(path, map_location=lambda storage, loc: storage) + self.model.load_model(path) + + self.save_path = kwargs.get("save", ModelWrapper.DEF_PATH + str(datetime.now().timestamp())) + + self.cuda = kwargs.get("cuda") + self.apex = kwargs.get("apex") if self.cuda else False + self.half = self.apex if self.apex else kwargs.get("half") + + def train(self, seed, cuda, mixed_precision, world_size, gpu_rank, rank, save_folder, dist_backend, dist_url, + epochs, continue_from, finetune, labels_path, sample_rate, window_size, window_stride, window, + hidden_size, hidden_layers, labels, supported_rnns, bidirectional, no_shuffle, no_sorta_grad, rnn_type, + train_manifest, augment, batch_size, num_workers, momentum, lr, static_loss_scale, dynamic_loss_scale, + val_manifest, max_norm, silent, checkpoint_per_batch, checkpoint, learning_anneal, model_path): + # Set seeds for determinism + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + np.random.seed(seed) + random.seed(seed) + + device = torch.device("cuda" if cuda else "cpu") + if mixed_precision and not cuda: + raise ValueError('If using mixed precision training, CUDA must be enabled!') + distributed = world_size > 1 + main_proc = True + device = torch.device("cuda" if cuda else "cpu") + if distributed: + if gpu_rank: + torch.cuda.set_device(int(gpu_rank)) + dist.init_process_group(backend=dist_backend, init_method=dist_url, + world_size=world_size, rank=rank) + main_proc = rank == 0 # Only the first proc should save 
models + save_folder = save_folder + os.makedirs(save_folder, exist_ok=True) # Ensure save folder exists + + loss_results, cer_results, wer_results = torch.Tensor(epochs), torch.Tensor(epochs), torch.Tensor(epochs) + best_wer = None + + avg_loss, start_epoch, start_iter, optim_state = 0, 0, 0, None + if continue_from: # Starting from previous model + print("Loading checkpoint model %s" % continue_from) + + labels = self.model.labels + audio_conf = self.model.audio_conf + if not finetune: # Don't want to restart training + optim_state = self.model.package['optim_dict'] + start_epoch = int(self.model.get('epoch', 1)) - 1 # Index start at 0 for training + start_iter = self.model.package.get('iteration', None) + if start_iter is None: + start_epoch += 1 # We saved model after epoch finished, start at the next epoch. + start_iter = 0 + else: + start_iter += 1 + avg_loss = int(self.model.package.get('avg_loss', 0)) + loss_results, cer_results, wer_results = self.model.package['loss_results'], \ + self.model.package['cer_results'], \ + self.model.package['wer_results'] + else: + with open(labels_path) as label_file: + labels = str(''.join(json.load(label_file))) + + audio_conf = dict(sample_rate=sample_rate, + window_size=window_size, + window_stride=window_stride, + window=window) + + rnn_type = rnn_type.lower() + assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru" + model = self.model(rnn_hidden_size=hidden_size, + nb_layers=hidden_layers, + labels=labels, + rnn_type=supported_rnns[rnn_type], + audio_conf=audio_conf, + bidirectional=bidirectional, + mixed_precision=mixed_precision) + + decoder = GreedyDecoder(labels) + + train_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=train_manifest, labels=labels, + normalize=False, augment=augment) + test_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=val_manifest, labels=labels, + normalize=False, augment=False) + if not distributed: + train_sampler = BucketingSampler(train_dataset, batch_size=batch_size) + else: + train_sampler = DistributedBucketingSampler(train_dataset, batch_size=batch_size, + num_replicas=world_size, rank=rank) + + train_loader = AudioDataLoader(train_dataset, num_workers=num_workers, batch_sampler=train_sampler) + test_loader = AudioDataLoader(test_dataset, batch_size=batch_size, num_workers=num_workers) + + if (not no_shuffle and start_epoch != 0) or no_sorta_grad: + print("Shuffling batches for the following epochs") + train_sampler.shuffle(start_epoch) + + model = model.to(device) + if mixed_precision: + model = convert_model_to_half(model) + parameters = model.parameters() + optimizer = torch.optim.SGD(parameters, lr=lr, + momentum=momentum, nesterov=True, weight_decay=1e-5) + if distributed: + model = DistributedDataParallel(model) + if mixed_precision: + optimizer = FP16_Optimizer(optimizer, + static_loss_scale=static_loss_scale, + dynamic_loss_scale=dynamic_loss_scale) + if optim_state is not None: + optimizer.load_state_dict(optim_state) + print(model) + print("Number of parameters: %d" % self.model.get_param_size(model)) + + criterion = CTCLoss() + batch_time = AverageMeter() + data_time = AverageMeter() + losses = AverageMeter() + + for epoch in range(start_epoch, epochs): + model.train() + end = time.time() + start_epoch_time = time.time() + for i, (data) in enumerate(train_loader, start=start_iter): + if i == len(train_sampler): + break + inputs, targets, input_percentages, target_sizes = data + input_sizes = input_percentages.mul_(int(inputs.size(3))).int() 
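The shape contract in this training loop: the model emits batch x time x classes, the output is transposed to time x batch x classes for warp-ctc, and the summed batch loss is divided by the minibatch size. A stand-in with the same shapes, using torch.nn.CTCLoss purely for illustration since warp-ctc requires a separate native build:

    import torch

    T, N, C = 100, 4, 29  # time steps, batch size, classes (blank = 0)
    log_probs = torch.randn(T, N, C).log_softmax(-1)
    targets = torch.randint(1, C, (N, 20), dtype=torch.long)
    output_sizes = torch.full((N,), T, dtype=torch.long)
    target_sizes = torch.full((N,), 20, dtype=torch.long)

    # reduction='sum' mirrors warp-ctc's summed batch loss before the division.
    ctc = torch.nn.CTCLoss(blank=0, reduction='sum')
    loss = ctc(log_probs, targets, output_sizes, target_sizes) / N
    print(loss.item())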
+ # measure data loading time + data_time.update(time.time() - end) + inputs = inputs.to(device) + + out, output_sizes = model(inputs, input_sizes) + out = out.transpose(0, 1) # TxNxH + + float_out = out.float() # ensure float32 for loss + loss = criterion(float_out, targets, output_sizes, target_sizes).to(device) + loss = loss / inputs.size(0) # average the loss by minibatch + + if distributed: + loss = loss.to(device) + loss_value = reduce_tensor(loss, world_size).item() + else: + loss_value = loss.item() + + # Check to ensure valid loss was calculated + valid_loss, error = check_loss(loss, loss_value) + if valid_loss: + optimizer.zero_grad() + # compute gradient + if mixed_precision: + optimizer.backward(loss) + optimizer.clip_master_grads(max_norm) + else: + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) + optimizer.step() + else: + print(error) + print('Skipping grad update') + loss_value = 0 + + avg_loss += loss_value + losses.update(loss_value, inputs.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + if not silent: + print('Epoch: [{0}][{1}/{2}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format( + (epoch + 1), (i + 1), len(train_sampler), batch_time=batch_time, data_time=data_time, + loss=losses)) + if checkpoint_per_batch > 0 and i > 0 and (i + 1) % checkpoint_per_batch == 0 and main_proc: + file_path = '%s/deepspeech_checkpoint_epoch_%d_iter_%d.pth' % (save_folder, epoch + 1, i + 1) + print("Saving checkpoint model to %s" % file_path) + torch.save(self.model.serialize(model, optimizer=optimizer, epoch=epoch, iteration=i, + loss_results=loss_results, + wer_results=wer_results, cer_results=cer_results, + avg_loss=avg_loss), + file_path) + del loss, out, float_out + + avg_loss /= len(train_sampler) + + epoch_time = time.time() - start_epoch_time + print(f"Elapsed time from start: {datetime.now() - sttime}") + print('Training Summary Epoch: [{0}]\t' + 'Time taken (s): {epoch_time:.0f}\t' + 'Average Loss {loss:.3f}\t'.format(epoch + 1, epoch_time=epoch_time, loss=avg_loss)) + + start_iter = 0 # Reset start iteration for next epoch + total_cer, total_wer = 0, 0 + model.eval() + with torch.no_grad(): + for i, (data) in tqdm(enumerate(test_loader), total=len(test_loader)): + inputs, targets, input_percentages, target_sizes = data + input_sizes = input_percentages.mul_(int(inputs.size(3))).int() + inputs = inputs.to(device) + + # unflatten targets + split_targets = [] + offset = 0 + for size in target_sizes: + split_targets.append(targets[offset:offset + size]) + offset += size + + out, output_sizes = model(inputs, input_sizes) + + decoded_output, _ = decoder.decode(out, output_sizes) + target_strings = decoder.convert_to_strings(split_targets) + wer, cer = 0, 0 + for x in range(len(target_strings)): + transcript, reference = decoded_output[x][0], target_strings[x][0] + wer += decoder.wer(transcript, reference) / float(len(reference.split())) + cer += decoder.cer(transcript, reference) / float(len(reference)) + total_cer += cer + total_wer += wer + del out + wer = total_wer / len(test_loader.dataset) + cer = total_cer / len(test_loader.dataset) + wer *= 100 + cer *= 100 + loss_results[epoch] = avg_loss + wer_results[epoch] = wer + cer_results[epoch] = cer + print('Validation Summary Epoch: [{0}]\t' + 'Average WER {wer:.3f}\t' + 'Average CER {cer:.3f}\t'.format( + epoch + 1, wer=wer, cer=cer)) + + values 
= { + 'loss_results': loss_results, + 'cer_results': cer_results, + 'wer_results': wer_results + } + + if main_proc and checkpoint: + file_path = '%s/deepspeech_%d.pth.tar' % (save_folder, epoch + 1) + torch.save(self.model.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results, + wer_results=wer_results, cer_results=cer_results), + file_path) + # anneal lr + param_groups = optimizer.optimizer.param_groups if mixed_precision else optimizer.param_groups + for g in param_groups: + g['lr'] = g['lr'] / learning_anneal + print('Learning rate annealed to: {lr:.6f}'.format(lr=g['lr'])) + + if main_proc and (best_wer is None or best_wer > wer): + print("Found better validated model, saving to %s" % model_path) + torch.save(self.model.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results, + wer_results=wer_results, cer_results=cer_results), model_path) + best_wer = wer + + avg_loss = 0 + if not no_shuffle: + print("Shuffling batches...") + train_sampler.shuffle(epoch) + + def validate(self): + pass + + def test(self): + torch.set_grad_enabled(False) + device = torch.device("cuda" if cuda else "cpu") + model = load_model(device, model_path, cuda) + + if decoder == "beam": + from decoder import BeamCTCDecoder + + decoder = BeamCTCDecoder(model.labels, lm_path=lm_path, alpha=alpha, beta=beta, + cutoff_top_n=cutoff_top_n, cutoff_prob=cutoff_prob, + beam_width=beam_width, num_processes=lm_workers) + elif decoder == "greedy": + decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_')) + else: + decoder = None + target_decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_')) + test_dataset = AudioDataset(audio_conf=model.audio_conf, manifest_filepath=test_manifest, + labels=model.labels, normalize=True) + test_loader = AudioDataLoader(test_dataset, batch_size=batch_size, + num_workers=num_workers) + total_cer, total_wer, num_tokens, num_chars = 0, 0, 0, 0 + output_data = [] + for i, (data) in tqdm(enumerate(test_loader), total=len(test_loader)): + inputs, targets, input_percentages, target_sizes = data + input_sizes = input_percentages.mul_(int(inputs.size(3))).int() + inputs = inputs.to(device) + # unflatten targets + split_targets = [] + offset = 0 + for size in target_sizes: + split_targets.append(targets[offset:offset + size]) + offset += size + + out, output_sizes = model(inputs, input_sizes) + + if save_output: + # add output to data array, and continue + output_data.append((out.cpu().numpy(), output_sizes.numpy())) + + decoded_output, _ = decoder.decode(out, output_sizes) + target_strings = target_decoder.convert_to_strings(split_targets) + for x in range(len(target_strings)): + transcript, reference = decoded_output[x][0], target_strings[x][0] + wer_inst = decoder.wer(transcript, reference) + cer_inst = decoder.cer(transcript, reference) + total_wer += wer_inst + total_cer += cer_inst + num_tokens += len(reference.split()) + num_chars += len(reference) + if verbose: + print("Ref:", reference.lower()) + print("Hyp:", transcript.lower()) + print("WER:", float(wer_inst) / len(reference.split()), "CER:", float(cer_inst) / len(reference), + "\n") + + wer = float(total_wer) / num_tokens + cer = float(total_cer) / num_chars + + print('Test Summary \t' + 'Average WER {wer:.3f}\t' + 'Average CER {cer:.3f}\t'.format(wer=wer * 100, cer=cer * 100)) + if save_output: + np.save(output_path, output_data) + + def infer(self, sound): + pass + + @staticmethod + def get_default_path(def_path: str) -> str: + """ + Returns the path to the 
latest checkpoint in the default location
+        :param def_path: default path where checkpoints are stored
+        :return: the path to the latest checkpoint
+        """
+        latest_subdir = max([os.path.join(def_path, d) for d in os.listdir(def_path)], key=os.path.getmtime)
+        default = latest_subdir + "/final.pth"
+        return default
+
+    def print_training_info(self, epoch, loss, cer, wer):
+        print(f"\nTraining Information\n " + \
+              f"- Epoch:\t{epoch}\n " + \
+              f"- Current Loss:\t{loss}\n " + \
+              f"- Current CER: \t{cer}\n " + \
+              f"- Current WER: \t{wer}")
diff --git a/sonosco/test.py b/sonosco/test.py
new file mode 100644
index 0000000..f4d8a39
--- /dev/null
+++ b/sonosco/test.py
@@ -0,0 +1,18 @@
+import argparse
+from typing import Dict
+
+import yaml
+
+from modelwrapper import ModelWrapper
+
+parser = argparse.ArgumentParser(description='ASR testing')
+parser.add_argument('--config', metavar='DIR',
+                    help='Path to test config file', default='config/test.yaml')
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    with open(args.config, 'r') as file:
+        config = yaml.load(file)
+    config_dict: Dict = config["test"]
+    model = ModelWrapper(**config_dict)
+    model.test()
diff --git a/sonosco/train.py b/sonosco/train.py
new file mode 100644
index 0000000..a763a47
--- /dev/null
+++ b/sonosco/train.py
@@ -0,0 +1,18 @@
+import argparse
+from typing import Dict
+
+import yaml
+
+from modelwrapper import ModelWrapper
+
+parser = argparse.ArgumentParser(description='ASR training')
+parser.add_argument('--config', metavar='DIR',
+                    help='Path to train config file', default='config/train.yaml')
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    with open(args.config, 'r') as file:
+        config = yaml.load(file)
+    config_dict: Dict = config["train"]
+    model = ModelWrapper(**config_dict)
+    model.train()
diff --git a/sonosco/utils.py b/sonosco/utils.py
new file mode 100644
index 0000000..5e219d1
--- /dev/null
+++ b/sonosco/utils.py
@@ -0,0 +1,51 @@
+import torch
+from apex.fp16_utils import BN_convert_float
+import torch.distributed as dist
+
+from models.deepspeech2 import DeepSpeech2
+
+
+def convert_model_to_half(model):
+    """
+    Converts model to half but keeps the batch norm layers in 32 bit for precision purposes
+    """
+    old_model = model
+    new_model = BN_convert_float(model.half())
+    del old_model  # Delete previous non-half model
+    return new_model
+
+
+def reduce_tensor(tensor, world_size, reduce_op_max=False):
+    rt = tensor.clone()
+    dist.all_reduce(rt, op=dist.reduce_op.MAX if reduce_op_max is True else dist.reduce_op.SUM)  # Default to sum
+    if not reduce_op_max:
+        rt /= world_size
+    return rt
+
+
+def check_loss(loss, loss_value):
+    """
+    Check that warp-ctc loss is valid and will not break training
+    :return: Return if loss is valid, and the error in case it is not
+    """
+    loss_valid = True
+    error = ''
+    if loss_value == float("inf") or loss_value == float("-inf"):
+        loss_valid = False
+        error = "WARNING: received an inf loss"
+    elif torch.isnan(loss).sum() > 0:
+        loss_valid = False
+        error = 'WARNING: received a nan loss, setting loss value to 0'
+    elif loss_value < 0:
+        loss_valid = False
+        error = "WARNING: received a negative loss"
+    return loss_valid, error
+
+
+def load_model(device, model_path, is_cuda):
+    model = DeepSpeech2.load_model(model_path)
+    model.eval()
+    model = model.to(device)
+    if is_cuda and model.mixed_precision:
+        model = convert_model_to_half(model)
+    return model
From ab254f587c2481503eed6e0dd25df6df012f997c Mon Sep 17 00:00:00 2001
From: ga38nif
Date: Sat, 25 May 2019
17:32:20 +0200 Subject: [PATCH 04/58] init datasets folder, remove legacy code, split loader in multiple files --- sonosco/datasets/AudioDataLoader.py | 36 +++++ sonosco/datasets/AudioDataSampler.py | 68 +++++++++ sonosco/datasets/AudioDataset.py | 137 ++++++++++++++++++ sonosco/datasets/__init__.py | 0 .../datasets/download_datasets/__init__.py | 0 sonosco/datasets/test_datasets.py | 25 ++++ 6 files changed, 266 insertions(+) create mode 100644 sonosco/datasets/AudioDataLoader.py create mode 100644 sonosco/datasets/AudioDataSampler.py create mode 100644 sonosco/datasets/AudioDataset.py create mode 100644 sonosco/datasets/__init__.py create mode 100644 sonosco/datasets/download_datasets/__init__.py create mode 100644 sonosco/datasets/test_datasets.py diff --git a/sonosco/datasets/AudioDataLoader.py b/sonosco/datasets/AudioDataLoader.py new file mode 100644 index 0000000..6c94f2c --- /dev/null +++ b/sonosco/datasets/AudioDataLoader.py @@ -0,0 +1,36 @@ + +import numpy as np +import torch + +from torch.utils.data import Dataset, DataLoader, Sampler + + +# TODO: Optimise +def _collate_fn(batch): + batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True) + longest_sample = batch[0][0] + freq_size, max_seqlength = longest_sample.size() + minibatch_size = len(batch) + inputs = torch.zeros(minibatch_size, 1, freq_size, max_seqlength) + input_percentages = torch.FloatTensor(minibatch_size) + target_sizes = np.zeros(minibatch_size, dtype=np.int32) + + # TODO: Numpy broadcasting magic + targets = [] + + for x in range(minibatch_size): + inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) + input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) + target_sizes[x] = len(batch[x][1]) + targets.extend(batch[x][1]) + + return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) + + +class AudioDataLoader(DataLoader): + def __init__(self, *args, **kwargs): + """ + Creates a data loader for AudioDatasets. + """ + super(AudioDataLoader, self).__init__(*args, **kwargs) + self.collate_fn = _collate_fn \ No newline at end of file diff --git a/sonosco/datasets/AudioDataSampler.py b/sonosco/datasets/AudioDataSampler.py new file mode 100644 index 0000000..100bde7 --- /dev/null +++ b/sonosco/datasets/AudioDataSampler.py @@ -0,0 +1,68 @@ +import math + +import numpy as np +import torch +from torch.utils.data import Sampler +from torch.distributed.deprecated import get_rank +from torch.distributed.deprecated import get_world_size + +class BucketingSampler(Sampler): + def __init__(self, data_source, batch_size=1): + """ + Samples batches assuming they are in order of size to batch similarly sized samples together. + """ + super(BucketingSampler, self).__init__(data_source) + self.data_source = data_source + ids = list(range(0, len(data_source))) + # TODO: Optimise + self.bins = [ids[i:i + batch_size] for i in range(0, len(ids), batch_size)] + + def __iter__(self): + for ids in self.bins: + np.random.shuffle(ids) + yield ids + + def __len__(self): + return len(self.bins) + + def shuffle(self, epoch): + np.random.shuffle(self.bins) + + +# TODO: Optimise +class DistributedBucketingSampler(Sampler): + def __init__(self, data_source, batch_size=1, num_replicas=None, rank=None): + """ + Samples batches assuming they are in order of size to batch similarly sized samples together. 
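Note that this sampler imports get_rank and get_world_size from torch.distributed.deprecated, whereas the loader.py it replaces used the current torch.distributed module. Both exist under the pinned torch 1.1, but the non-deprecated form is the safer long-term choice:

    # The current API, matching what loader.py used before this refactor:
    from torch.distributed import get_rank, get_world_size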
+ """ + super(DistributedBucketingSampler, self).__init__(data_source) + if num_replicas is None: + num_replicas = get_world_size() + if rank is None: + rank = get_rank() + self.data_source = data_source + self.ids = list(range(0, len(data_source))) + self.batch_size = batch_size + self.bins = [self.ids[i:i + batch_size] for i in range(0, len(self.ids), batch_size)] + self.num_replicas = num_replicas + self.rank = rank + self.num_samples = int(math.ceil(len(self.bins) * 1.0 / self.num_replicas)) + self.total_size = self.num_samples * self.num_replicas + + def __iter__(self): + offset = self.rank + # add extra samples to make it evenly divisible + bins = self.bins + self.bins[:(self.total_size - len(self.bins))] + assert len(bins) == self.total_size + samples = bins[offset::self.num_replicas] # Get every Nth bin, starting from rank + return iter(samples) + + def __len__(self): + return self.num_samples + + def shuffle(self, epoch): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(epoch) + bin_ids = list(torch.randperm(len(self.bins), generator=g)) + self.bins = [self.bins[i] for i in bin_ids] \ No newline at end of file diff --git a/sonosco/datasets/AudioDataset.py b/sonosco/datasets/AudioDataset.py new file mode 100644 index 0000000..7639723 --- /dev/null +++ b/sonosco/datasets/AudioDataset.py @@ -0,0 +1,137 @@ +# ---------------------------------------------------------------------------- +# Based on SeanNaren's deepspeech.pytorch: +# https://github.com/SeanNaren/deepspeech.pytorch +# ---------------------------------------------------------------------------- + +import warnings +from typing import Tuple + +import torch +import torchaudio +from scipy import signal +from torch.utils.data import Dataset + +windows = {"bartlett": torch.bartlett_window, + "blackman": torch.blackman_window, + "hamming": torch.hamming_window, + "hann": torch.hann_window} + + +class DataProcessor(object): + def __init__(self, audio_conf, labels="abc", normalize=False, augment=False): + """ + Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by + a comma. Each new line is a different sample. Example below: + /path/to/audio.wav,/path/to/audio.txt + ... 
+ :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds + :param labels: String containing all the possible characters to map to + :param normalize: Apply standard mean and deviation normalization to audio tensor + :param augment(default False): Apply random tempo and gain perturbations + """ + self.labels_map = dict([(labels[i], i) for i in range(len(labels))]) + self.window_stride = audio_conf["window_stride"] + self.window_size = audio_conf["window_size"] + self.sample_rate = audio_conf["sample_rate"] + self.window = windows.get(audio_conf["window"], windows["hamming"]) + self.normalize = normalize + self.augment = augment + + @staticmethod + def retrieve_file(audio_path): + sound, sample_rate = torchaudio.load(audio_path) + return sound, sample_rate + + @staticmethod + def augment_audio(sound, tempo_range: Tuple = (0.85, 1.15), gain_range: Tuple = (-6, 8)): + """ + Changes tempo and gain of the wave + """ + warnings.warn("Augmentation is not implemented") # TODO: Implement + return sound + + def parse_audio(self, audio_path): + sound, sample_rate = self.retrieve_file(audio_path) + if sample_rate != self.sample_rate: + raise ValueError(f"The stated sample rate {self.sample_rate} and the factual rate {sample_rate} differ!") + + if self.augment: + sound = self.augment_audio(sound) + + #sound = sound.cuda() + spectrogram = torch.stft(torch.from_numpy(sound.numpy().T.squeeze()), + n_fft=int(self.sample_rate * self.window_size), + hop_length=int(self.sample_rate * self.window_stride), + win_length=int(self.sample_rate * self.window_size), + window=torch.hamming_window(int(self.sample_rate * self.window_size)))[:, :, -1] + + + + if self.normalize: + mean = spectrogram.mean() + std = spectrogram.std() + spectrogram.add_(-mean) + spectrogram.div_(std) + + return spectrogram + + def parse_transcript(self, transcript_path): + with open(transcript_path, 'r', encoding='utf8') as transcript_file: + transcript = transcript_file.read().replace('\n', '') + # TODO: Is it fast enough? + transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)])) + return transcript + + +class AudioDataset(Dataset): + def __init__(self, audio_conf, manifest_filepath, labels, normalize=False, augment=False): + """ + Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by + a comma. Each new line is a different sample. Example below: + /path/to/audio.wav,/path/to/audio.txt + ... 
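One thing to flag in parse_audio above: under torch 1.1, torch.stft returns a real tensor of shape (freq, time, 2) holding the real and imaginary parts, so the trailing [:, :, -1] keeps only the imaginary component, not the magnitude that the legacy librosa path computed. A sketch of the magnitude variant for comparison:

    import torch

    sound = torch.randn(16000)  # one second of dummy 16 kHz audio
    n_fft, hop = 320, 160
    stft = torch.stft(sound, n_fft=n_fft, hop_length=hop, win_length=n_fft,
                      window=torch.hamming_window(n_fft))  # (freq, time, 2) under torch 1.1
    magnitude = stft.pow(2).sum(-1).sqrt()  # |STFT|: combine real and imaginary parts
    log_spec = torch.log1p(magnitude)       # matches the legacy log(S + 1) scaling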
+ :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds + :param manifest_filepath: Path to manifest csv as describe above + :param labels: String containing all the possible characters to map to + :param normalize: Apply standard mean and deviation normalization to audio tensor + :param augment(default False): Apply random tempo and gain perturbations + """ + super(AudioDataset, self).__init__() + with open(manifest_filepath) as f: + ids = f.readlines() + ids = [x.strip().split(',') for x in ids] + self.ids = ids + self.size = len(ids) + self.processor = DataProcessor(audio_conf, labels, normalize, augment) + + def __getitem__(self, index): + sample = self.ids[index] + audio_path, transcript_path = sample[0], sample[1] + + spectrogram = self.processor.parse_audio(audio_path) + transcript = self.processor.parse_transcript(transcript_path) + + return spectrogram, transcript + + def __len__(self): + return self.size + +def main(): + audio_conf = dict(sample_rate=16000, + window_size=.02, + window_stride=.01, + window='hamming') + test_manifest = '/Users/florianlay/data/libri_test_clean_manifest.csv' + labels = 'abc' + test_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=test_manifest, labels=labels, + normalize=False, augment=False) + print("Dataset is created\n====================\n") + + test = test_dataset[0] + batch_size = 16 + sampler = BucketingSampler(test_dataset, batch_size=batch_size) + dataloader = DataLoader(dataset=test_dataset, num_workers=4, collate_fn=_collate_fn, batch_sampler=sampler) + test_dataset[0] + #inputs, targets, input_percentages, target_sizes = next(iter(dataloader)) +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/sonosco/datasets/__init__.py b/sonosco/datasets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sonosco/datasets/download_datasets/__init__.py b/sonosco/datasets/download_datasets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sonosco/datasets/test_datasets.py b/sonosco/datasets/test_datasets.py new file mode 100644 index 0000000..bc02b8d --- /dev/null +++ b/sonosco/datasets/test_datasets.py @@ -0,0 +1,25 @@ +from AudioDataLoader import AudioDataLoader +from AudioDataSampler import BucketingSampler, DistributedBucketingSampler +from AudioDataset import AudioDataset + + +def main(): + audio_conf = dict(sample_rate=16000, + window_size=.02, + window_stride=.01, + window='hamming') + test_manifest = '/Users/florianlay/data/libri_test_clean_manifest.csv' + labels = 'abc' + test_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=test_manifest, labels=labels, + normalize=False, augment=False) + print("Dataset is created\n====================\n") + + test = test_dataset[0] + batch_size = 16 + sampler = BucketingSampler(test_dataset, batch_size=batch_size) + dataloader = AudioDataLoader(test_dataset,num_workers=4, batch_sampler=sampler) + + inputs, targets, input_percentages, target_sizes = next(iter(dataloader)) + print(targets) +if __name__ == "__main__": + main() \ No newline at end of file From dcaa86c03c6a00c165f82eae8d900aa63b438beb Mon Sep 17 00:00:00 2001 From: ga38nif Date: Sat, 25 May 2019 17:58:06 +0200 Subject: [PATCH 05/58] rename testscript --- sonosco/datasets/AudioDataset.py | 4 +++- .../datasets/{test_datasets.py => datasets_test_script.py} | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) rename sonosco/datasets/{test_datasets.py => datasets_test_script.py} (83%) diff --git 
a/sonosco/datasets/AudioDataset.py b/sonosco/datasets/AudioDataset.py index 7639723..0a95284 100644 --- a/sonosco/datasets/AudioDataset.py +++ b/sonosco/datasets/AudioDataset.py @@ -78,8 +78,10 @@ def parse_audio(self, audio_path): def parse_transcript(self, transcript_path): with open(transcript_path, 'r', encoding='utf8') as transcript_file: transcript = transcript_file.read().replace('\n', '') + print(f"1: {transcript}") # TODO: Is it fast enough? transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)])) + print(f"transcript_path: {transcript_path}\ntranscript: {transcript}") return transcript @@ -122,7 +124,7 @@ def main(): window_stride=.01, window='hamming') test_manifest = '/Users/florianlay/data/libri_test_clean_manifest.csv' - labels = 'abc' + labels = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' test_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=test_manifest, labels=labels, normalize=False, augment=False) print("Dataset is created\n====================\n") diff --git a/sonosco/datasets/test_datasets.py b/sonosco/datasets/datasets_test_script.py similarity index 83% rename from sonosco/datasets/test_datasets.py rename to sonosco/datasets/datasets_test_script.py index bc02b8d..7f8a11f 100644 --- a/sonosco/datasets/test_datasets.py +++ b/sonosco/datasets/datasets_test_script.py @@ -4,6 +4,10 @@ def main(): + labels_path = "/Users/florianlay/roboy/sonosco/sonosco/datasets/labels.json" + with open(labels_path) as label_file: + labels = str(''.join(json.load(label_file))) + audio_conf = dict(sample_rate=16000, window_size=.02, window_stride=.01, @@ -20,6 +24,6 @@ def main(): dataloader = AudioDataLoader(test_dataset,num_workers=4, batch_sampler=sampler) inputs, targets, input_percentages, target_sizes = next(iter(dataloader)) - print(targets) + print(test) if __name__ == "__main__": main() \ No newline at end of file From 02eb538f8befc825c6226dd857d91c18a1fbbc3b Mon Sep 17 00:00:00 2001 From: ga38nif Date: Sun, 26 May 2019 23:38:53 +0200 Subject: [PATCH 06/58] move collate fn in class --- sonosco/datasets/AudioDataLoader.py | 46 ++++++++++++++--------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/sonosco/datasets/AudioDataLoader.py b/sonosco/datasets/AudioDataLoader.py index 6c94f2c..3161abd 100644 --- a/sonosco/datasets/AudioDataLoader.py +++ b/sonosco/datasets/AudioDataLoader.py @@ -4,33 +4,31 @@ from torch.utils.data import Dataset, DataLoader, Sampler - -# TODO: Optimise -def _collate_fn(batch): - batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True) - longest_sample = batch[0][0] - freq_size, max_seqlength = longest_sample.size() - minibatch_size = len(batch) - inputs = torch.zeros(minibatch_size, 1, freq_size, max_seqlength) - input_percentages = torch.FloatTensor(minibatch_size) - target_sizes = np.zeros(minibatch_size, dtype=np.int32) - - # TODO: Numpy broadcasting magic - targets = [] - - for x in range(minibatch_size): - inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) - input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) - target_sizes[x] = len(batch[x][1]) - targets.extend(batch[x][1]) - - return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) - - class AudioDataLoader(DataLoader): def __init__(self, *args, **kwargs): """ Creates a data loader for AudioDatasets. 
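
Worked example of what the collate step produces (shapes only, assuming a batch of three spectrograms with 161 frequency bins and 120/80/50 frames):

    inputs.shape       # (3, 1, 161, 120): zero-padded to the longest clip, sorted longest first
    input_percentages  # (1.0, 80/120, 50/120): true length as a fraction of the padded width
    targets            # 1-D IntTensor with all transcript label ids concatenated
    target_sizes       # (len_0, len_1, len_2): needed to split `targets` back per sample, e.g. for CTC
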
""" super(AudioDataLoader, self).__init__(*args, **kwargs) - self.collate_fn = _collate_fn \ No newline at end of file + self.collate_fn = self._collate_fn + +# TODO: Optimise + def _collate_fn(batch): + batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True) + longest_sample = batch[0][0] + freq_size, max_seqlength = longest_sample.size() + minibatch_size = len(batch) + inputs = torch.zeros(minibatch_size, 1, freq_size, max_seqlength) + input_percentages = torch.FloatTensor(minibatch_size) + target_sizes = np.zeros(minibatch_size, dtype=np.int32) + + # TODO: Numpy broadcasting magic + targets = [] + + for x in range(minibatch_size): + inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) + input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) + target_sizes[x] = len(batch[x][1]) + targets.extend(batch[x][1]) + + return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) \ No newline at end of file From 08c7cdaac9d5e2837c976ac21012018ffe25861c Mon Sep 17 00:00:00 2001 From: ga38nif Date: Sun, 26 May 2019 23:51:39 +0200 Subject: [PATCH 07/58] add scripts from deepspeech to download datasets --- sonosco/datasets/download_datasets/an4.py | 87 +++++++++++++ .../download_datasets/common_voice.py | 90 +++++++++++++ .../datasets/download_datasets/librispeech.py | 113 ++++++++++++++++ sonosco/datasets/download_datasets/ted3.py | 123 ++++++++++++++++++ .../datasets/download_datasets/voxforge.py | 102 +++++++++++++++ 5 files changed, 515 insertions(+) create mode 100644 sonosco/datasets/download_datasets/an4.py create mode 100644 sonosco/datasets/download_datasets/common_voice.py create mode 100644 sonosco/datasets/download_datasets/librispeech.py create mode 100644 sonosco/datasets/download_datasets/ted3.py create mode 100644 sonosco/datasets/download_datasets/voxforge.py diff --git a/sonosco/datasets/download_datasets/an4.py b/sonosco/datasets/download_datasets/an4.py new file mode 100644 index 0000000..f810ee0 --- /dev/null +++ b/sonosco/datasets/download_datasets/an4.py @@ -0,0 +1,87 @@ +import argparse +import os +import io +import shutil +import tarfile +import wget + +from utils import create_manifest + +parser = argparse.ArgumentParser(description='Processes and downloads an4.') +parser.add_argument('--target-dir', default='an4_dataset/', help='Path to save dataset') +parser.add_argument('--min-duration', default=1, type=int, + help='Prunes training samples shorter than the min duration (given in seconds, default 1)') +parser.add_argument('--max-duration', default=15, type=int, + help='Prunes training samples longer than the max duration (given in seconds, default 15)') +args = parser.parse_args() + + +def _format_data(root_path, data_tag, name, wav_folder): + data_path = args.target_dir + data_tag + '/' + name + '/' + new_transcript_path = data_path + '/txt/' + new_wav_path = data_path + '/wav/' + + os.makedirs(new_transcript_path) + os.makedirs(new_wav_path) + + wav_path = root_path + 'wav/' + file_ids = root_path + 'etc/an4_%s.fileids' % data_tag + transcripts = root_path + 'etc/an4_%s.transcription' % data_tag + train_path = wav_path + wav_folder + + _convert_audio_to_wav(train_path) + _format_files(file_ids, new_transcript_path, new_wav_path, transcripts, wav_path) + + +def _convert_audio_to_wav(train_path): + with os.popen('find %s -type f -name "*.raw"' % train_path) as pipe: + for line in pipe: + raw_path = line.strip() + new_path = line.replace('.raw', '.wav').strip() + cmd = 'sox -t raw -r %d -b 16 -e 
signed-integer -B -c 1 \"%s\" \"%s\"' % ( + 16000, raw_path, new_path) + os.system(cmd) + + +def _format_files(file_ids, new_transcript_path, new_wav_path, transcripts, wav_path): + with open(file_ids, 'r') as f: + with open(transcripts, 'r') as t: + paths = f.readlines() + transcripts = t.readlines() + for x in range(len(paths)): + path = wav_path + paths[x].strip() + '.wav' + filename = path.split('/')[-1] + extracted_transcript = _process_transcript(transcripts, x) + current_path = os.path.abspath(path) + new_path = new_wav_path + filename + text_path = new_transcript_path + filename.replace('.wav', '.txt') + with io.FileIO(text_path, "w") as file: + file.write(extracted_transcript.encode('utf-8')) + os.rename(current_path, new_path) + + +def _process_transcript(transcripts, x): + extracted_transcript = transcripts[x].split('(')[0].strip("").split('<')[0].strip().upper() + return extracted_transcript + + +def main(): + root_path = 'an4/' + name = 'an4' + wget.download('http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz') + tar = tarfile.open('an4_raw.bigendian.tar.gz') + tar.extractall() + os.makedirs(args.target_dir) + _format_data(root_path, 'train', name, 'an4_clstk') + _format_data(root_path, 'test', name, 'an4test_clstk') + shutil.rmtree(root_path) + os.remove('an4_raw.bigendian.tar.gz') + train_path = args.target_dir + '/train/' + test_path = args.target_dir + '/test/' + print ('\n', 'Creating manifests...') + create_manifest(train_path, 'an4_train_manifest.csv', args.min_duration, args.max_duration) + create_manifest(test_path, 'an4_val_manifest.csv') + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/sonosco/datasets/download_datasets/common_voice.py b/sonosco/datasets/download_datasets/common_voice.py new file mode 100644 index 0000000..fbc7b91 --- /dev/null +++ b/sonosco/datasets/download_datasets/common_voice.py @@ -0,0 +1,90 @@ +import os +import wget +import tarfile +import argparse +import csv +from multiprocessing.pool import ThreadPool +import subprocess +from utils import create_manifest + +parser = argparse.ArgumentParser(description='Downloads and processes Mozilla Common Voice dataset.') +parser.add_argument("--target-dir", default='CommonVoice_dataset/', type=str, help="Directory to store the dataset.") +parser.add_argument("--tar-path", type=str, help="Path to the Common Voice *.tar file if downloaded (Optional).") +parser.add_argument('--sample-rate', default=16000, type=int, help='Sample rate') +parser.add_argument('--min-duration', default=1, type=int, + help='Prunes training samples shorter than the min duration (given in seconds, default 1)') +parser.add_argument('--max-duration', default=15, type=int, + help='Prunes training samples longer than the max duration (given in seconds, default 15)') +parser.add_argument('--files-to-process', default="cv-valid-dev.csv,cv-valid-test.csv,cv-valid-train.csv", + type=str, help='list of *.csv file names to process') +args = parser.parse_args() +COMMON_VOICE_URL = "https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz" + + +def convert_to_wav(csv_file, target_dir): + """ Read *.csv file description, convert mp3 to wav, process text. + Save results to target_dir. 
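
With the defaults above, each command that _convert_audio_to_wav shells out looks like this (paths illustrative):

    sox -t raw -r 16000 -b 16 -e signed-integer -B -c 1 "an4/wav/an4_clstk/fash/an251-fash-b.raw" "an4/wav/an4_clstk/fash/an251-fash-b.wav"

i.e. the big-endian 16 kHz raw recordings are rewrapped in place as 16-bit mono wav files.
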
+ Args: + csv_file: str, path to *.csv file with data description, usually start from 'cv-' + target_dir: str, path to dir to save results; wav/ and txt/ dirs will be created + """ + wav_dir = os.path.join(target_dir, 'wav/') + txt_dir = os.path.join(target_dir, 'txt/') + os.makedirs(wav_dir, exist_ok=True) + os.makedirs(txt_dir, exist_ok=True) + path_to_data = os.path.dirname(csv_file) + + def process(x): + file_path, text = x + file_name = os.path.splitext(os.path.basename(file_path))[0] + text = text.strip().upper() + with open(os.path.join(txt_dir, file_name + '.txt'), 'w') as f: + f.write(text) + cmd = "sox {} -r {} -b 16 -c 1 {}".format( + os.path.join(path_to_data, file_path), + args.sample_rate, + os.path.join(wav_dir, file_name + '.wav')) + subprocess.call([cmd], shell=True) + + print('Converting mp3 to wav for {}.'.format(csv_file)) + with open(csv_file) as csvfile: + reader = csv.DictReader(csvfile) + data = [(row['filename'], row['text']) for row in reader] + with ThreadPool(10) as pool: + pool.map(process, data) + + +def main(): + target_dir = args.target_dir + os.makedirs(target_dir, exist_ok=True) + + target_unpacked_dir = os.path.join(target_dir, "CV_unpacked") + os.makedirs(target_unpacked_dir, exist_ok=True) + + if args.tar_path and os.path.exists(args.tar_path): + print('Find existing file {}'.format(args.tar_path)) + target_file = args.tar_path + else: + print("Could not find downloaded Common Voice archive, Downloading corpus...") + filename = wget.download(COMMON_VOICE_URL, target_dir) + target_file = os.path.join(target_dir, os.path.basename(filename)) + + print("Unpacking corpus to {} ...".format(target_unpacked_dir)) + tar = tarfile.open(target_file) + tar.extractall(target_unpacked_dir) + tar.close() + + for csv_file in args.files_to_process.split(','): + convert_to_wav(os.path.join(target_unpacked_dir, 'cv_corpus_v1/', csv_file), + os.path.join(target_dir, os.path.splitext(csv_file)[0])) + + print('Creating manifests...') + for csv_file in args.files_to_process.split(','): + create_manifest(os.path.join(target_dir, os.path.splitext(csv_file)[0]), + os.path.splitext(csv_file)[0] + '_manifest.csv', + args.min_duration, + args.max_duration) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/sonosco/datasets/download_datasets/librispeech.py b/sonosco/datasets/download_datasets/librispeech.py new file mode 100644 index 0000000..cc618a1 --- /dev/null +++ b/sonosco/datasets/download_datasets/librispeech.py @@ -0,0 +1,113 @@ +import os +import wget +import tarfile +import argparse +import subprocess +from utils import create_manifest +from tqdm import tqdm +import shutil + +parser = argparse.ArgumentParser(description='Processes and downloads LibriSpeech dataset.') +parser.add_argument("--target-dir", default='LibriSpeech_dataset/', type=str, help="Directory to store the dataset.") +parser.add_argument('--sample-rate', default=16000, type=int, help='Sample rate') +parser.add_argument('--files-to-use', default="train-clean-100.tar.gz," + "train-clean-360.tar.gz,train-other-500.tar.gz," + "dev-clean.tar.gz,dev-other.tar.gz," + "test-clean.tar.gz,test-other.tar.gz", type=str, + help='list of file names to download') +parser.add_argument('--min-duration', default=1, type=int, + help='Prunes training samples shorter than the min duration (given in seconds, default 1)') +parser.add_argument('--max-duration', default=15, type=int, + help='Prunes training samples longer than the max duration (given in seconds, default 15)') +args = 
parser.parse_args() + +LIBRI_SPEECH_URLS = { + "train": ["http://www.openslr.org/resources/12/train-clean-100.tar.gz", + "http://www.openslr.org/resources/12/train-clean-360.tar.gz", + "http://www.openslr.org/resources/12/train-other-500.tar.gz"], + + "val": ["http://www.openslr.org/resources/12/dev-clean.tar.gz", + "http://www.openslr.org/resources/12/dev-other.tar.gz"], + + "test_clean": ["http://www.openslr.org/resources/12/test-clean.tar.gz"], + "test_other": ["http://www.openslr.org/resources/12/test-other.tar.gz"] +} + + +def _preprocess_transcript(phrase): + return phrase.strip().upper() + + +def _process_file(wav_dir, txt_dir, base_filename, root_dir): + full_recording_path = os.path.join(root_dir, base_filename) + assert os.path.exists(full_recording_path) and os.path.exists(root_dir) + wav_recording_path = os.path.join(wav_dir, base_filename.replace(".flac", ".wav")) + subprocess.call(["sox {} -r {} -b 16 -c 1 {}".format(full_recording_path, str(args.sample_rate), + wav_recording_path)], shell=True) + # process transcript + txt_transcript_path = os.path.join(txt_dir, base_filename.replace(".flac", ".txt")) + transcript_file = os.path.join(root_dir, "-".join(base_filename.split('-')[:-1]) + ".trans.txt") + assert os.path.exists(transcript_file), "Transcript file {} does not exist.".format(transcript_file) + transcriptions = open(transcript_file).read().strip().split("\n") + transcriptions = {t.split()[0].split("-")[-1]: " ".join(t.split()[1:]) for t in transcriptions} + with open(txt_transcript_path, "w") as f: + key = base_filename.replace(".flac", "").split("-")[-1] + assert key in transcriptions, "{} is not in the transcriptions".format(key) + f.write(_preprocess_transcript(transcriptions[key])) + f.flush() + + +def main(): + target_dl_dir = args.target_dir + if not os.path.exists(target_dl_dir): + os.makedirs(target_dl_dir) + files_to_dl = args.files_to_use.strip().split(',') + for split_type, lst_libri_urls in LIBRI_SPEECH_URLS.items(): + split_dir = os.path.join(target_dl_dir, split_type) + if not os.path.exists(split_dir): + os.makedirs(split_dir) + split_wav_dir = os.path.join(split_dir, "wav") + if not os.path.exists(split_wav_dir): + os.makedirs(split_wav_dir) + split_txt_dir = os.path.join(split_dir, "txt") + if not os.path.exists(split_txt_dir): + os.makedirs(split_txt_dir) + extracted_dir = os.path.join(split_dir, "LibriSpeech") + if os.path.exists(extracted_dir): + shutil.rmtree(extracted_dir) + for url in lst_libri_urls: + # check if we want to dl this file + dl_flag = False + for f in files_to_dl: + if url.find(f) != -1: + dl_flag = True + if not dl_flag: + print("Skipping url: {}".format(url)) + continue + filename = url.split("/")[-1] + target_filename = os.path.join(split_dir, filename) + if not os.path.exists(target_filename): + wget.download(url, split_dir) + print("Unpacking {}...".format(filename)) + tar = tarfile.open(target_filename) + tar.extractall(split_dir) + tar.close() + os.remove(target_filename) + print("Converting flac files to wav and extracting transcripts...") + assert os.path.exists(extracted_dir), "Archive {} was not properly uncompressed.".format(filename) + for root, subdirs, files in tqdm(os.walk(extracted_dir)): + for f in files: + if f.find(".flac") != -1: + _process_file(wav_dir=split_wav_dir, txt_dir=split_txt_dir, + base_filename=f, root_dir=root) + + print("Finished {}".format(url)) + shutil.rmtree(extracted_dir) + if split_type == 'train': # Prune to min/max duration + create_manifest(split_dir, 'libri_' + split_type + 
'_manifest.csv', args.min_duration, args.max_duration) + else: + create_manifest(split_dir, 'libri_' + split_type + '_manifest.csv') + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/sonosco/datasets/download_datasets/ted3.py b/sonosco/datasets/download_datasets/ted3.py new file mode 100644 index 0000000..c7d4b3e --- /dev/null +++ b/sonosco/datasets/download_datasets/ted3.py @@ -0,0 +1,123 @@ +import os +import wget +import tarfile +import argparse +import subprocess +import unicodedata +import io +from utils import create_manifest +from tqdm import tqdm + +parser = argparse.ArgumentParser(description='Processes and downloads TED-LIUMv3 dataset.') +parser.add_argument("--target-dir", default='TEDLIUM3_dataset/', type=str, help="Directory to store the dataset.") +parser.add_argument("--tar-path", type=str, help="Path to the TEDLIUM_release tar if downloaded (Optional).") +parser.add_argument('--sample-rate', default=16000, type=int, help='Sample rate') +parser.add_argument('--min-duration', default=1, type=int, + help='Prunes training samples shorter than the min duration (given in seconds, default 1)') +parser.add_argument('--max-duration', default=15, type=int, + help='Prunes training samples longer than the max duration (given in seconds, default 15)') +args = parser.parse_args() + +TED_LIUM_V2_DL_URL = "http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz" + + +def get_utterances_from_stm(stm_file): + """ + Return list of entries containing phrase and its start/end timings + :param stm_file: + :return: + """ + res = [] + with io.open(stm_file, "r", encoding='utf-8') as f: + for stm_line in f: + tokens = stm_line.split() + start_time = float(tokens[3]) + end_time = float(tokens[4]) + filename = tokens[0] + transcript = unicodedata.normalize("NFKD", + " ".join(t for t in tokens[6:]).strip()). 
\ + encode("utf-8", "ignore").decode("utf-8", "ignore") + if transcript != "ignore_time_segment_in_scoring": + res.append({ + "start_time": start_time, "end_time": end_time, + "filename": filename, "transcript": transcript + }) + return res + + +def cut_utterance(src_sph_file, target_wav_file, start_time, end_time, sample_rate=16000): + subprocess.call(["sox {} -r {} -b 16 -c 1 {} trim {} ={}".format(src_sph_file, str(sample_rate), + target_wav_file, start_time, end_time)], + shell=True) + + +def _preprocess_transcript(phrase): + return phrase.strip().upper() + + +def filter_short_utterances(utterance_info, min_len_sec=1.0): + return utterance_info["end_time"] - utterance_info["start_time"] > min_len_sec + + +def prepare_dir(ted_dir): + converted_dir = os.path.join(ted_dir, "converted") + # directories to store converted wav files and their transcriptions + wav_dir = os.path.join(converted_dir, "wav") + if not os.path.exists(wav_dir): + os.makedirs(wav_dir) + txt_dir = os.path.join(converted_dir, "txt") + if not os.path.exists(txt_dir): + os.makedirs(txt_dir) + counter = 0 + entries = os.listdir(os.path.join(ted_dir, "sph")) + for sph_file in tqdm(entries, total=len(entries)): + speaker_name = sph_file.split('.sph')[0] + + sph_file_full = os.path.join(ted_dir, "sph", sph_file) + stm_file_full = os.path.join(ted_dir, "stm", "{}.stm".format(speaker_name)) + + assert os.path.exists(sph_file_full) and os.path.exists(stm_file_full) + all_utterances = get_utterances_from_stm(stm_file_full) + + all_utterances = filter(filter_short_utterances, all_utterances) + for utterance_id, utterance in enumerate(all_utterances): + target_wav_file = os.path.join(wav_dir, "{}_{}.wav".format(utterance["filename"], str(utterance_id))) + target_txt_file = os.path.join(txt_dir, "{}_{}.txt".format(utterance["filename"], str(utterance_id))) + cut_utterance(sph_file_full, target_wav_file, utterance["start_time"], utterance["end_time"], + sample_rate=args.sample_rate) + with io.FileIO(target_txt_file, "w") as f: + f.write(_preprocess_transcript(utterance["transcript"]).encode('utf-8')) + counter += 1 + + +def main(): + target_dl_dir = args.target_dir + #if not os.path.exists(target_dl_dir): + # os.makedirs(target_dl_dir) + + target_unpacked_dir = os.path.join(target_dl_dir, "TEDLIUM_release-3") + #if args.tar_path and os.path.exists(args.tar_path): + # target_file = args.tar_path + #else: + # print("Could not find downloaded TEDLIUM archive, Downloading corpus...") + # wget.download(TED_LIUM_V2_DL_URL, target_dl_dir) + # target_file = os.path.join(target_dl_dir, "TEDLIUM_release-3.tgz") + #if not os.path.exists(target_unpacked_dir): + # print("Unpacking corpus...") + # tar = tarfile.open(target_file) + # tar.extractall(target_dl_dir) + # tar.close() + #else: + # print("Found TEDLIUM directory, skipping unpacking of tar files") + + train_ted_dir = os.path.join(target_unpacked_dir, "data") + train_ted_dir = os.path.join(train_ted_dir, "converted") + + #prepare_dir(train_ted_dir) + print('Creating manifests...') + + create_manifest(train_ted_dir, 'ted3_train_manifest.csv', args.min_duration, args.max_duration) + + +if __name__ == "__main__": + main() diff --git a/sonosco/datasets/download_datasets/voxforge.py b/sonosco/datasets/download_datasets/voxforge.py new file mode 100644 index 0000000..a31febf --- /dev/null +++ b/sonosco/datasets/download_datasets/voxforge.py @@ -0,0 +1,102 @@ +import os +from six.moves import urllib +import argparse +import re +import tempfile +import shutil +import subprocess +import tarfile 
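
For reference, a TED-LIUM .stm line looks roughly like this (layout inferred from the parser above: tokens[0] is the recording name, tokens[3] and tokens[4] the start/end time in seconds, tokens[6:] the transcript; the sample values are made up):

    AlGore_2009 1 AlGore_2009 15.42 25.76 <o,f0,male> we need to go far quickly

Segments transcribed as ignore_time_segment_in_scoring are dropped, and every surviving utterance is cut out of its .sph recording via the sox trim call in cut_utterance.
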
+import io +from tqdm import tqdm + +from utils import create_manifest + +VOXFORGE_URL_16kHz = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit/' + +parser = argparse.ArgumentParser(description='Processes and downloads VoxForge dataset.') +parser.add_argument("--target-dir", default='voxforge_dataset/', type=str, help="Directory to store the dataset.") +parser.add_argument('--sample-rate', default=16000, + type=int, help='Sample rate') +parser.add_argument('--min-duration', default=1, type=int, + help='Prunes training samples shorter than the min duration (given in seconds, default 1)') +parser.add_argument('--max-duration', default=15, type=int, + help='Prunes training samples longer than the max duration (given in seconds, default 15)') +args = parser.parse_args() + + +def _get_recordings_dir(sample_dir, recording_name): + wav_dir = os.path.join(sample_dir, recording_name, "wav") + if os.path.exists(wav_dir): + return "wav", wav_dir + flac_dir = os.path.join(sample_dir, recording_name, "flac") + if os.path.exists(flac_dir): + return "flac", flac_dir + raise Exception("wav or flac directory was not found for recording name: {}".format(recording_name)) + + +def prepare_sample(recording_name, url, target_folder): + """ + Downloads and extracts a sample from VoxForge and puts the wav and txt files into :target_folder. + """ + wav_dir = os.path.join(target_folder, "wav") + if not os.path.exists(wav_dir): + os.makedirs(wav_dir) + txt_dir = os.path.join(target_folder, "txt") + if not os.path.exists(txt_dir): + os.makedirs(txt_dir) + # check if sample is processed + filename_set = set(['_'.join(wav_file.split('_')[:-1]) for wav_file in os.listdir(wav_dir)]) + if recording_name in filename_set: + return + + request = urllib.request.Request(url) + response = urllib.request.urlopen(request) + content = response.read() + response.close() + with tempfile.NamedTemporaryFile(suffix=".tgz", mode='wb') as target_tgz: + target_tgz.write(content) + target_tgz.flush() + dirpath = tempfile.mkdtemp() + + tar = tarfile.open(target_tgz.name) + tar.extractall(dirpath) + tar.close() + + recordings_type, recordings_dir = _get_recordings_dir(dirpath, recording_name) + tgz_prompt_file = os.path.join(dirpath, recording_name, "etc", "PROMPTS") + + if os.path.exists(recordings_dir) and os.path.exists(tgz_prompt_file): + transcriptions = open(tgz_prompt_file).read().strip().split("\n") + transcriptions = {t.split()[0]: " ".join(t.split()[1:]) for t in transcriptions} + for wav_file in os.listdir(recordings_dir): + recording_id = wav_file.split('.{}'.format(recordings_type))[0] + transcription_key = recording_name + "/mfc/" + recording_id + if transcription_key not in transcriptions: + continue + utterance = transcriptions[transcription_key] + + target_wav_file = os.path.join(wav_dir, "{}_{}.wav".format(recording_name, recording_id)) + target_txt_file = os.path.join(txt_dir, "{}_{}.txt".format(recording_name, recording_id)) + with io.FileIO(target_txt_file, "w") as file: + file.write(utterance.encode('utf-8')) + original_wav_file = os.path.join(recordings_dir, wav_file) + subprocess.call(["sox {} -r {} -b 16 -c 1 {}".format(original_wav_file, str(args.sample_rate), + target_wav_file)], shell=True) + + shutil.rmtree(dirpath) + + +if __name__ == '__main__': + target_dir = args.target_dir + sample_rate = args.sample_rate + + if not os.path.isdir(target_dir): + os.makedirs(target_dir) + request = urllib.request.Request(VOXFORGE_URL_16kHz) + response = urllib.request.urlopen(request) + 
content = response.read() + all_files = re.findall("href\=\"(.*\.tgz)\"", content.decode("utf-8")) + for f in tqdm(all_files, total=len(all_files)): + prepare_sample(f.replace(".tgz", ""), VOXFORGE_URL_16kHz + f, target_dir) + print('Creating manifests...') + create_manifest(target_dir, 'voxforge_train_manifest.csv', args.min_duration, args.max_duration) \ No newline at end of file From 31bc8125a7de4df0947b1a9b3870925f521d3c79 Mon Sep 17 00:00:00 2001 From: ga38nif Date: Mon, 27 May 2019 00:27:51 +0200 Subject: [PATCH 08/58] add wget to requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index c7d87ee..19ce43b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,4 +21,5 @@ torchaudio==0.2 torchvision==0.3.0 tqdm==4.32.1 pyyaml==5.1 +wget==3.2 -e git+git@github.com:SeanNaren/warp-ctc.git@ab27454d0cf3f936a6e19165682fe2b759f3f8e5#egg=warpctc_pytorch&subdirectory=pytorch_binding From fcae804043b880cf6aac39ebe73978456f3f242e Mon Sep 17 00:00:00 2001 From: ga38nif Date: Mon, 27 May 2019 00:28:30 +0200 Subject: [PATCH 09/58] modify librispeech script to save data to .temp --- .../datasets/download_datasets/librispeech.py | 40 ++++++++++++------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/sonosco/datasets/download_datasets/librispeech.py b/sonosco/datasets/download_datasets/librispeech.py index cc618a1..b9c8fc4 100644 --- a/sonosco/datasets/download_datasets/librispeech.py +++ b/sonosco/datasets/download_datasets/librispeech.py @@ -3,7 +3,7 @@ import tarfile import argparse import subprocess -from utils import create_manifest +from data_utils import create_manifest from tqdm import tqdm import shutil @@ -22,15 +22,15 @@ args = parser.parse_args() LIBRI_SPEECH_URLS = { - "train": ["http://www.openslr.org/resources/12/train-clean-100.tar.gz", - "http://www.openslr.org/resources/12/train-clean-360.tar.gz", - "http://www.openslr.org/resources/12/train-other-500.tar.gz"], + #"train": ["http://www.openslr.org/resources/12/train-clean-100.tar.gz", + # "http://www.openslr.org/resources/12/train-clean-360.tar.gz", + # "http://www.openslr.org/resources/12/train-other-500.tar.gz"], - "val": ["http://www.openslr.org/resources/12/dev-clean.tar.gz", - "http://www.openslr.org/resources/12/dev-other.tar.gz"], + #"val": ["http://www.openslr.org/resources/12/dev-clean.tar.gz", + # "http://www.openslr.org/resources/12/dev-other.tar.gz"], - "test_clean": ["http://www.openslr.org/resources/12/test-clean.tar.gz"], - "test_other": ["http://www.openslr.org/resources/12/test-other.tar.gz"] + "test_clean": ["http://www.openslr.org/resources/12/test-clean.tar.gz"]#, + #"test_other": ["http://www.openslr.org/resources/12/test-other.tar.gz"] } @@ -58,12 +58,24 @@ def _process_file(wav_dir, txt_dir, base_filename, root_dir): def main(): - target_dl_dir = args.target_dir - if not os.path.exists(target_dl_dir): - os.makedirs(target_dl_dir) - files_to_dl = args.files_to_use.strip().split(',') + root = os.path.expanduser('~') + data_path = '.temp/data/libri' + + filenames = [ + 'train-clean-100.tar.gz', + 'train-clean-360.tar.gz', + 'train-other-500.tar.gz', + 'dev-clean.tar.gz', + 'dev-other.tar.gz', + 'test-clean.tar.gz', + 'test-other.tar.gz' + ] + path_to_data = os.path.join(root, data_path) + if not os.path.exists(path_to_data): + os.makedirs(path_to_data) + for split_type, lst_libri_urls in LIBRI_SPEECH_URLS.items(): - split_dir = os.path.join(target_dl_dir, split_type) + split_dir = os.path.join(path_to_data, split_type) if not 
os.path.exists(split_dir): os.makedirs(split_dir) split_wav_dir = os.path.join(split_dir, "wav") @@ -78,7 +90,7 @@ def main(): for url in lst_libri_urls: # check if we want to dl this file dl_flag = False - for f in files_to_dl: + for f in filenames: if url.find(f) != -1: dl_flag = True if not dl_flag: From 7a5d5a51db9b38f0fc1141f4d6809dd62760827d Mon Sep 17 00:00:00 2001 From: ga38nif Date: Mon, 27 May 2019 00:29:04 +0200 Subject: [PATCH 10/58] add utils file and merge_manifest script from deepspeech --- .../datasets/download_datasets/data_utils.py | 43 +++++++++++++++++++ .../download_datasets/merge_manifests.py | 31 +++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 sonosco/datasets/download_datasets/data_utils.py create mode 100644 sonosco/datasets/download_datasets/merge_manifests.py diff --git a/sonosco/datasets/download_datasets/data_utils.py b/sonosco/datasets/download_datasets/data_utils.py new file mode 100644 index 0000000..ae2cc68 --- /dev/null +++ b/sonosco/datasets/download_datasets/data_utils.py @@ -0,0 +1,43 @@ +from __future__ import print_function + +import fnmatch +import io +import os +from tqdm import tqdm +import subprocess +import torch.distributed as dist + + +def create_manifest(data_path, output_path, min_duration=None, max_duration=None): + file_paths = [os.path.join(dirpath, f) + for dirpath, dirnames, files in os.walk(data_path) + for f in fnmatch.filter(files, '*.wav')] + file_paths = order_and_prune_files(file_paths, min_duration, max_duration) + with io.FileIO(output_path, "w") as file: + for wav_path in tqdm(file_paths, total=len(file_paths)): + transcript_path = wav_path.replace('/wav/', '/txt/').replace('.wav', '.txt') + sample = os.path.abspath(wav_path) + ',' + os.path.abspath(transcript_path) + '\n' + file.write(sample.encode('utf-8')) + print('\n') + + +def order_and_prune_files(file_paths, min_duration, max_duration): + print("Sorting manifests...") + duration_file_paths = [(path, float(subprocess.check_output( + ['soxi -D \"%s\"' % path.strip()], shell=True))) for path in file_paths] + if min_duration and max_duration: + print("Pruning manifests between %d and %d seconds" % (min_duration, max_duration)) + duration_file_paths = [(path, duration) for path, duration in duration_file_paths if + min_duration <= duration <= max_duration] + + def func(element): + return element[1] + + duration_file_paths.sort(key=func) + return [x[0] for x in duration_file_paths] # Remove durations + +def reduce_tensor(tensor, world_size): + rt = tensor.clone() + dist.all_reduce(rt, op=dist.reduce_op.SUM) + rt /= world_size + return rt \ No newline at end of file diff --git a/sonosco/datasets/download_datasets/merge_manifests.py b/sonosco/datasets/download_datasets/merge_manifests.py new file mode 100644 index 0000000..e5e0fab --- /dev/null +++ b/sonosco/datasets/download_datasets/merge_manifests.py @@ -0,0 +1,31 @@ +from __future__ import print_function + +import argparse +import io +import os + +from tqdm import tqdm +from utils import order_and_prune_files + +parser = argparse.ArgumentParser(description='Merges all manifest CSV files in specified folder.') +parser.add_argument('--merge-dir', default='manifests/', help='Path to all manifest files you want to merge') +parser.add_argument('--min-duration', default=1, type=int, + help='Prunes any samples shorter than the min duration (given in seconds, default 1)') +parser.add_argument('--max-duration', default=15, type=int, + help='Prunes any samples longer than the max duration (given in seconds, default 
15)') +parser.add_argument('--output-path', default='merged_manifest.csv', help='Output path to merged manifest') + +args = parser.parse_args() + +file_paths = [] +for file in os.listdir(args.merge_dir): + if file.endswith(".csv"): + with open(os.path.join(args.merge_dir, file), 'r') as fh: + file_paths += fh.readlines() +file_paths = [file_path.split(',')[0] for file_path in file_paths] +file_paths = order_and_prune_files(file_paths, args.min_duration, args.max_duration) +with io.FileIO(args.output_path, "w") as file: + for wav_path in tqdm(file_paths, total=len(file_paths)): + transcript_path = wav_path.replace('/wav/', '/txt/').replace('.wav', '.txt') + sample = os.path.abspath(wav_path) + ',' + os.path.abspath(transcript_path) + '\n' + file.write(sample.encode('utf-8')) \ No newline at end of file From d6163b755f99ded5015c382d7bec0f923d9b601d Mon Sep 17 00:00:00 2001 From: "w.jurasz" Date: Mon, 27 May 2019 00:30:37 +0200 Subject: [PATCH 11/58] =?UTF-8?q?Minor=20fixed=CB=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 2 +- sonosco/__init__.py | 0 sonosco/config/test.yaml | 2 +- sonosco/loader.py | 6 ++++-- sonosco/utils.py | 10 ++++++++-- 5 files changed, 14 insertions(+), 6 deletions(-) create mode 100644 sonosco/__init__.py diff --git a/requirements.txt b/requirements.txt index 19ce43b..bba5fc7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -apex==0.1 +#apex==0.1 audioread==2.1.7 cycler==0.10.0 decorator==4.4.0 diff --git a/sonosco/__init__.py b/sonosco/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sonosco/config/test.yaml b/sonosco/config/test.yaml index 2589e15..7609ef1 100644 --- a/sonosco/config/test.yaml +++ b/sonosco/config/test.yaml @@ -4,6 +4,6 @@ test: batch_size: 32 # Batch size for testing num_workers: 4 # Number of workers used in loading verbose: True # Print out decoded output and error of each sample - save_output: Trur # Saves output of model from test + save_output: True # Saves output of model from test output_path: "" # Where to save raw acoustic output diff --git a/sonosco/loader.py b/sonosco/loader.py index 00b06d1..b73b065 100644 --- a/sonosco/loader.py +++ b/sonosco/loader.py @@ -13,8 +13,10 @@ import torchaudio from scipy import signal from torch.utils.data import Dataset, DataLoader, Sampler -from torch.distributed import get_rank -from torch.distributed import get_world_size + +# FIXME: Deprecated functions usage +from torch.distributed.deprecated import get_rank +from torch.distributed.deprecated import get_world_size windows = {"bartlett": torch.bartlett_window, "blackman": torch.blackman_window, diff --git a/sonosco/utils.py b/sonosco/utils.py index 5e219d1..72d8353 100644 --- a/sonosco/utils.py +++ b/sonosco/utils.py @@ -1,5 +1,11 @@ import torch -from apex.fp16_utils import BN_convert_float + +try: + from apex.fp16_utils import BN_convert_float +except Exception as e: + print(f"Apex import failed: {e}") + + import torch.distributed as dist from models.deepspeech2 import DeepSpeech2 @@ -43,7 +49,7 @@ def check_loss(loss, loss_value): def load_model(device, model_path, is_cuda): - model = DeepSpeech.load_model(model_path) + model = DeepSpeech2.load_model(model_path) model.eval() model = model.to(device) if is_cuda and model.mixed_precision: From b4c70bbdb00865a537d2890317be14f410f4e311 Mon Sep 17 00:00:00 2001 From: "w.jurasz" Date: Thu, 30 May 2019 19:11:47 +0200 Subject: [PATCH 12/58] Added install script plus slight refactornig --- 
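The script below assumes a virtual env at ./venv (overridable) and builds SeanNaren's warp-ctc from source before installing the post-requirements (torchaudio and apex). A typical invocation, with a hypothetical venv path:

    ./install_dependencies.sh --cuda=true --python_path=/opt/venvs/sonosco

When CUDA is disabled, it patches warp-ctc's CMakeLists.txt so that OpenMP is only enabled when CUDA is found.
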
.gitignore | 2 +- install_dependencies.sh | 43 +++++++++++++++++++++++++++++++++++++++++ post_requirements.txt | 2 ++ requirements.txt | 3 --- setup.py | 1 + 5 files changed, 47 insertions(+), 4 deletions(-) create mode 100755 install_dependencies.sh create mode 100644 post_requirements.txt diff --git a/.gitignore b/.gitignore index 7a0e43a..ba47f6f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ # Created by .ignore support plugin (hsz.mobi) .idea/ - +warp-ctc/ ### Python template # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/install_dependencies.sh b/install_dependencies.sh new file mode 100755 index 0000000..bea0827 --- /dev/null +++ b/install_dependencies.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +#This scripts assumes that you have a virtual env in ./venv, you can override this by ./install_dependencies.sh -p /some/other/path + +set -e + +# define arguments +for i in "$@" +do +case ${i} in + -c=*|--cuda=*) + CUDA="${i#*=}" + shift # past argument=value + ;; + -p=*|--python_path=*) + PYTHON_HOME_PATH="${i#*=}" + shift # past argument=value + ;; + *) + # unknown option + ;; +esac +done + +PYTHON_HOME_PATH=${PYTHON_HOME_PATH:-./venv} +#TODO: Infer this automatically +CUDA=${CUDA:-false} +source ${PYTHON_HOME_PATH}/bin/activate + +pip install -r requirements.txt + +git clone https://github.com/SeanNaren/warp-ctc.git +if [ "$CUDA" = false ] ; then + sed -i '' 's/option(WITH_OMP \"compile warp-ctc with openmp.\" ON)/option(WITH_OMP \"compile warp-ctc with openmp.\" ${CUDA_FOUND})/' warp-ctc/CMakeLists.txt +else + export CUDA_HOME="/usr/local/cuda" +fi +cd warp-ctc; mkdir build; cd build; cmake ..; make +cd ../pytorch_binding && python setup.py install +cd ../.. +rm -rf warp-ctc + +pip install -r post_requirements.txt \ No newline at end of file diff --git a/post_requirements.txt b/post_requirements.txt new file mode 100644 index 0000000..f4c027f --- /dev/null +++ b/post_requirements.txt @@ -0,0 +1,2 @@ +-e git://github.com/pytorch/audio.git#egg=torchaudio-0.2 +-e git://github.com/NVIDIA/apex.git#egg=apex diff --git a/requirements.txt b/requirements.txt index bba5fc7..46e6bd6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -#apex==0.1 audioread==2.1.7 cycler==0.10.0 decorator==4.4.0 @@ -17,9 +16,7 @@ scikit-learn==0.21.2 scipy==1.3.0 six==1.12.0 torch==1.1.0 -torchaudio==0.2 torchvision==0.3.0 tqdm==4.32.1 pyyaml==5.1 wget==3.2 --e git+git@github.com:SeanNaren/warp-ctc.git@ab27454d0cf3f936a6e19165682fe2b759f3f8e5#egg=warpctc_pytorch&subdirectory=pytorch_binding diff --git a/setup.py b/setup.py index bdb1387..1dc7d35 100644 --- a/setup.py +++ b/setup.py @@ -6,4 +6,5 @@ author="The Roboy Gang", packages=["sonosco"], include_package_data=True, + dependency_links=['http://github.com/pytorch/audio/tarball/master#egg=torchaudio-0.2'] ) From 60f750030aa44393a50f90254f6b78ba3515e72c Mon Sep 17 00:00:00 2001 From: "w.jurasz" Date: Thu, 30 May 2019 22:05:59 +0200 Subject: [PATCH 13/58] Added Dockerfile, started PyCandle integration --- .dockerignore | 127 ++++++++++++++++++++++++++++++++++++++ .gitignore | 5 ++ Dockerfile | 38 ++++++++++++ install_dependencies.sh | 15 ++++- requirements.txt | 42 ++++++------- sonosco/pycandle_train.py | 18 ++++++ 6 files changed, 221 insertions(+), 24 deletions(-) create mode 100644 .dockerignore create mode 100644 Dockerfile create mode 100644 sonosco/pycandle_train.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..f8f2779 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,127 @@ +# Created 
by .ignore support plugin (hsz.mobi) +### VirtualEnv template +# Virtualenv +# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ +.Python +[Bb]in +[Ii]nclude +[Ll]ib +[Ll]ib64 +[Ll]ocal +[Ss]cripts +pyvenv.cfg +.venv +pip-selfcheck.json +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +### Example user template template +### Example user template + +# IntelliJ project files +.idea +*.iml +out +gen diff --git a/.gitignore b/.gitignore index ba47f6f..58898a1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,9 @@ # Created by .ignore support plugin (hsz.mobi) +sonosco/pycandle/ +sonosco/pycandle +sonosco/datasets/download_datasets/ +!sonosco/datasets/download_datasets/*.py + .idea/ warp-ctc/ ### Python template diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..8f57494 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,38 @@ +FROM pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel +ARG CUDA=false + + +WORKDIR /workspace/ +COPY . . +# install basics +RUN apt-get update -y +RUN apt-get install -y git curl ca-certificates bzip2 cmake tree htop bmon iotop sox libsox-dev libsox-fmt-all vim wget + +ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH + +# install python deps +RUN pip install -r requirements.txt + +RUN rm -rf warp-ctc +RUN git clone https://github.com/SeanNaren/warp-ctc.git +RUN if [ "$CUDA" = false ] ; then sed -i 's/option(WITH_OMP \"compile warp-ctc with openmp.\" ON)/option(WITH_OMP \"compile warp-ctc with openmp.\" ${CUDA_FOUND})/' warp-ctc/CMakeLists.txt ; else export CUDA_HOME="/usr/local/cuda" ; fi +RUN cd warp-ctc; mkdir build; cd build; cmake ..; make +RUN cd warp-ctc/pytorch_binding && python setup.py install +RUN rm -rf warp-ctc + +RUN pip install -r post_requirements.txt + + +#TODO: Do we need those two below? +# install ctcdecode +#RUN git clone --recursive https://github.com/parlance/ctcdecode.git +#RUN cd ctcdecode; pip install . + +# install deepspeech.pytorch +ADD . 
/workspace/deepspeech.pytorch +RUN cd deepspeech.pytorch; pip install -r requirements.txt + +# launch jupiter +RUN pip install jupyter +RUN mkdir data; mkdir notebooks; +CMD jupyter-notebook --ip="*" --no-browser --allow-root \ No newline at end of file diff --git a/install_dependencies.sh b/install_dependencies.sh index bea0827..571c767 100755 --- a/install_dependencies.sh +++ b/install_dependencies.sh @@ -12,8 +12,12 @@ case ${i} in CUDA="${i#*=}" shift # past argument=value ;; + -e=*|--venv=*) + VENV="${i#*=}" + shift # past argument=value + ;; -p=*|--python_path=*) - PYTHON_HOME_PATH="${i#*=}" + VENV_PATH="${i#*=}" shift # past argument=value ;; *) @@ -22,10 +26,15 @@ case ${i} in esac done -PYTHON_HOME_PATH=${PYTHON_HOME_PATH:-./venv} +VENV=${VENV:-true} + +if [ "$VENV" = true ] ; then + VENV_PATH=${VENV_PATH:-./venv} + source ${VENV_PATH}/bin/activate +fi + #TODO: Infer this automatically CUDA=${CUDA:-false} -source ${PYTHON_HOME_PATH}/bin/activate pip install -r requirements.txt diff --git a/requirements.txt b/requirements.txt index 46e6bd6..bd62925 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,22 +1,22 @@ -audioread==2.1.7 -cycler==0.10.0 -decorator==4.4.0 -joblib==0.13.2 -kiwisolver==1.1.0 -librosa==0.6.3 -llvmlite==0.28.0 -matplotlib==3.1.0 -numba==0.43.1 -numpy==1.16.3 -Pillow==6.0.0 -pyparsing==2.4.0 -python-dateutil==2.8.0 -resampy==0.2.1 -scikit-learn==0.21.2 -scipy==1.3.0 -six==1.12.0 +#audioread==2.1.7 +#cycler==0.10.0 +#decorator==4.4.0 +#joblib==0.13.2 +#kiwisolver==1.1.0 +#librosa==0.6.3 +#llvmlite==0.28.0 +#matplotlib==3.1.0 +#numba==0.43.1 +#numpy==1.16.3 +#Pillow==6.0.0 +#pyparsing==2.4.0 +#python-dateutil==2.8.0 +#resampy==0.2.1 +#scikit-learn==0.21.2 +#scipy==1.3.0 +#six==1.12.0 torch==1.1.0 -torchvision==0.3.0 -tqdm==4.32.1 -pyyaml==5.1 -wget==3.2 +#torchvision==0.3.0 +#tqdm==4.32.1 +#pyyaml==5.1 +#wget==3.2 diff --git a/sonosco/pycandle_train.py b/sonosco/pycandle_train.py new file mode 100644 index 0000000..9b08b19 --- /dev/null +++ b/sonosco/pycandle_train.py @@ -0,0 +1,18 @@ +import torch +import torchvision +import torch.nn.functional as F + +from sonosco.models.deepspeech2 import DeepSpeech2 +from sonosco.pycandle.general.experiment import Experiment +from sonosco.pycandle.training.model_trainer import ModelTrainer + + +def load_datasets(batch_size_train, batch_size_test): + pass + +model = DeepSpeech2().cuda() +experiment = Experiment('mnist_example') +train_loader, val_loader = load_datasets(batch_size_train=64, batch_size_test=64) +optimizer = torch.optim.Adam(model.parameters(), lr=0.01) +model_trainer = ModelTrainer(model, optimizer, F.nll_loss, 20, train_loader, gpu=0) +model_trainer.start_training() From 824e97bec1fb37f8a0e81700e7a0429b7530c61c Mon Sep 17 00:00:00 2001 From: "w.jurasz" Date: Sat, 1 Jun 2019 13:35:32 +0200 Subject: [PATCH 14/58] =?UTF-8?q?Added=20simple=20train=20function=CB=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + install_dependencies.sh | 1 + sonosco/pycandle_train.py | 25 ++++++++++++++++++++----- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 58898a1..14dcffc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # Created by .ignore support plugin (hsz.mobi) sonosco/pycandle/ sonosco/pycandle +sonosco/experiments/ sonosco/datasets/download_datasets/ !sonosco/datasets/download_datasets/*.py diff --git a/install_dependencies.sh b/install_dependencies.sh index 571c767..4cdbedd 100755 --- 
a/install_dependencies.sh +++ b/install_dependencies.sh @@ -40,6 +40,7 @@ pip install -r requirements.txt git clone https://github.com/SeanNaren/warp-ctc.git if [ "$CUDA" = false ] ; then + # This works for mac, for other OSes remove '' after -i sed -i '' 's/option(WITH_OMP \"compile warp-ctc with openmp.\" ON)/option(WITH_OMP \"compile warp-ctc with openmp.\" ${CUDA_FOUND})/' warp-ctc/CMakeLists.txt else export CUDA_HOME="/usr/local/cuda" diff --git a/sonosco/pycandle_train.py b/sonosco/pycandle_train.py index 9b08b19..ddb233e 100644 --- a/sonosco/pycandle_train.py +++ b/sonosco/pycandle_train.py @@ -1,18 +1,33 @@ import torch -import torchvision import torch.nn.functional as F +import sonosco.datasets.download_datasets.librispeech as librispeech +from sonosco.datasets.AudioDataLoader import AudioDataLoader +from sonosco.datasets.AudioDataSampler import BucketingSampler +from sonosco.datasets.AudioDataset import AudioDataset from sonosco.models.deepspeech2 import DeepSpeech2 from sonosco.pycandle.general.experiment import Experiment from sonosco.pycandle.training.model_trainer import ModelTrainer +def load_datasets(manifest_path, batch_size_train, batch_size_test): + audio_conf = dict(sample_rate=16000, + window_size=.02, + window_stride=.01, + window='hamming') + labels = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + test_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=manifest_path, labels=labels, + normalize=False, augment=False) + print("Dataset is created\n====================\n") -def load_datasets(batch_size_train, batch_size_test): - pass + batch_size = 16 + sampler = BucketingSampler(test_dataset, batch_size=batch_size) + return AudioDataLoader(test_dataset, num_workers=4, batch_sampler=sampler) -model = DeepSpeech2().cuda() +librispeech.main() +model = DeepSpeech2().cpu() experiment = Experiment('mnist_example') -train_loader, val_loader = load_datasets(batch_size_train=64, batch_size_test=64) +train_loader = load_datasets("./datasets/download_datasets/libri_test_clean_manifest.csv", + batch_size_train=64, batch_size_test=64) optimizer = torch.optim.Adam(model.parameters(), lr=0.01) model_trainer = ModelTrainer(model, optimizer, F.nll_loss, 20, train_loader, gpu=0) model_trainer.start_training() From 97d65ce3279163418ccb97ee2596488676346a9c Mon Sep 17 00:00:00 2001 From: "w.jurasz" Date: Sat, 1 Jun 2019 14:05:58 +0200 Subject: [PATCH 15/58] =?UTF-8?q?Import=20fixed=20=CB=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sonosco/datasets/download_datasets/librispeech.py | 2 +- sonosco/pycandle_train.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sonosco/datasets/download_datasets/librispeech.py b/sonosco/datasets/download_datasets/librispeech.py index b9c8fc4..31773cf 100644 --- a/sonosco/datasets/download_datasets/librispeech.py +++ b/sonosco/datasets/download_datasets/librispeech.py @@ -3,7 +3,7 @@ import tarfile import argparse import subprocess -from data_utils import create_manifest +from datasets.download_datasets.data_utils import create_manifest from tqdm import tqdm import shutil diff --git a/sonosco/pycandle_train.py b/sonosco/pycandle_train.py index ddb233e..2d02796 100644 --- a/sonosco/pycandle_train.py +++ b/sonosco/pycandle_train.py @@ -1,13 +1,13 @@ import torch import torch.nn.functional as F -import sonosco.datasets.download_datasets.librispeech as librispeech +import datasets.download_datasets.librispeech as librispeech -from sonosco.datasets.AudioDataLoader import 
AudioDataLoader -from sonosco.datasets.AudioDataSampler import BucketingSampler -from sonosco.datasets.AudioDataset import AudioDataset -from sonosco.models.deepspeech2 import DeepSpeech2 -from sonosco.pycandle.general.experiment import Experiment -from sonosco.pycandle.training.model_trainer import ModelTrainer +from datasets.AudioDataLoader import AudioDataLoader +from datasets.AudioDataSampler import BucketingSampler +from datasets.AudioDataset import AudioDataset +from models.deepspeech2 import DeepSpeech2 +from pycandle.general.experiment import Experiment +from pycandle.training.model_trainer import ModelTrainer def load_datasets(manifest_path, batch_size_train, batch_size_test): audio_conf = dict(sample_rate=16000, From d328973cc0370be6acca11ce28dbc07dedeec762 Mon Sep 17 00:00:00 2001 From: "w.jurasz" Date: Sat, 1 Jun 2019 15:52:32 +0200 Subject: [PATCH 16/58] Uncommented requirements --- requirements.txt | 42 +++++++++++++++++++++--------------------- setup.py | 2 +- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/requirements.txt b/requirements.txt index bd62925..46e6bd6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,22 +1,22 @@ -#audioread==2.1.7 -#cycler==0.10.0 -#decorator==4.4.0 -#joblib==0.13.2 -#kiwisolver==1.1.0 -#librosa==0.6.3 -#llvmlite==0.28.0 -#matplotlib==3.1.0 -#numba==0.43.1 -#numpy==1.16.3 -#Pillow==6.0.0 -#pyparsing==2.4.0 -#python-dateutil==2.8.0 -#resampy==0.2.1 -#scikit-learn==0.21.2 -#scipy==1.3.0 -#six==1.12.0 +audioread==2.1.7 +cycler==0.10.0 +decorator==4.4.0 +joblib==0.13.2 +kiwisolver==1.1.0 +librosa==0.6.3 +llvmlite==0.28.0 +matplotlib==3.1.0 +numba==0.43.1 +numpy==1.16.3 +Pillow==6.0.0 +pyparsing==2.4.0 +python-dateutil==2.8.0 +resampy==0.2.1 +scikit-learn==0.21.2 +scipy==1.3.0 +six==1.12.0 torch==1.1.0 -#torchvision==0.3.0 -#tqdm==4.32.1 -#pyyaml==5.1 -#wget==3.2 +torchvision==0.3.0 +tqdm==4.32.1 +pyyaml==5.1 +wget==3.2 diff --git a/setup.py b/setup.py index 1dc7d35..207c04e 100644 --- a/setup.py +++ b/setup.py @@ -6,5 +6,5 @@ author="The Roboy Gang", packages=["sonosco"], include_package_data=True, - dependency_links=['http://github.com/pytorch/audio/tarball/master#egg=torchaudio-0.2'] + dependency_links=[] ) From 993e5ecfa41042491f0455d6e63c669f8587588e Mon Sep 17 00:00:00 2001 From: "w.jurasz" Date: Sat, 1 Jun 2019 18:50:17 +0200 Subject: [PATCH 17/58] =?UTF-8?q?Fixed=20args=CB=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 ++ Dockerfile | 2 +- sonosco/datasets/AudioDataLoader.py | 4 ++-- sonosco/pycandle_train.py | 4 ++-- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 14dcffc..453b1ff 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ sonosco/experiments/ sonosco/datasets/download_datasets/ !sonosco/datasets/download_datasets/*.py +**/.DS_Store + .idea/ warp-ctc/ ### Python template diff --git a/Dockerfile b/Dockerfile index 8f57494..3b9c9ab 100644 --- a/Dockerfile +++ b/Dockerfile @@ -35,4 +35,4 @@ RUN cd deepspeech.pytorch; pip install -r requirements.txt # launch jupiter RUN pip install jupyter RUN mkdir data; mkdir notebooks; -CMD jupyter-notebook --ip="*" --no-browser --allow-root \ No newline at end of file +#CMD jupyter-notebook --ip="*" --no-browser --allow-root \ No newline at end of file diff --git a/sonosco/datasets/AudioDataLoader.py b/sonosco/datasets/AudioDataLoader.py index 3161abd..8786c2d 100644 --- a/sonosco/datasets/AudioDataLoader.py +++ b/sonosco/datasets/AudioDataLoader.py @@ -12,8 
+12,8 @@ def __init__(self, *args, **kwargs): super(AudioDataLoader, self).__init__(*args, **kwargs) self.collate_fn = self._collate_fn -# TODO: Optimise - def _collate_fn(batch): + # TODO: Optimise + def _collate_fn(self, batch): batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True) longest_sample = batch[0][0] freq_size, max_seqlength = longest_sample.size() diff --git a/sonosco/pycandle_train.py b/sonosco/pycandle_train.py index 2d02796..7310789 100644 --- a/sonosco/pycandle_train.py +++ b/sonosco/pycandle_train.py @@ -1,6 +1,6 @@ import torch import torch.nn.functional as F -import datasets.download_datasets.librispeech as librispeech +#import datasets.download_datasets.librispeech as librispeech from datasets.AudioDataLoader import AudioDataLoader from datasets.AudioDataSampler import BucketingSampler @@ -23,7 +23,7 @@ def load_datasets(manifest_path, batch_size_train, batch_size_test): sampler = BucketingSampler(test_dataset, batch_size=batch_size) return AudioDataLoader(test_dataset, num_workers=4, batch_sampler=sampler) -librispeech.main() +# librispeech.main() model = DeepSpeech2().cpu() experiment = Experiment('mnist_example') train_loader = load_datasets("./datasets/download_datasets/libri_test_clean_manifest.csv", From 4135da1b7402d64d998e6ee854b4c82c8629ff01 Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Tue, 4 Jun 2019 14:14:07 +0200 Subject: [PATCH 18/58] start refactoring librispeech data download --- sonosco/common/__init__.py | 0 sonosco/common/audio_tools.py | 9 +++ sonosco/common/click_extensions.py | 16 ++++ sonosco/common/utils.py | 21 +++++ .../datasets/download_datasets/data_utils.py | 35 ++++---- .../datasets/download_datasets/librispeech.py | 80 +++++++++---------- 6 files changed, 102 insertions(+), 59 deletions(-) create mode 100644 sonosco/common/__init__.py create mode 100644 sonosco/common/audio_tools.py create mode 100644 sonosco/common/click_extensions.py create mode 100644 sonosco/common/utils.py diff --git a/sonosco/common/__init__.py b/sonosco/common/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sonosco/common/audio_tools.py b/sonosco/common/audio_tools.py new file mode 100644 index 0000000..21c811b --- /dev/null +++ b/sonosco/common/audio_tools.py @@ -0,0 +1,9 @@ +import subprocess + + +def get_duration(file_path): + return float(subprocess.check_output([f'soxi -D "{file_path.strip()}"'], shell=True)) + + +def transcode_recording(source, destination, sample_rate): + subprocess.call([f"sox {source} -r {sample_rate} -b 16 -c 1 {destination}"], shell=True) diff --git a/sonosco/common/click_extensions.py b/sonosco/common/click_extensions.py new file mode 100644 index 0000000..18ff96f --- /dev/null +++ b/sonosco/common/click_extensions.py @@ -0,0 +1,16 @@ +import click +import ast +import logging + + +logger = logging.getLogger(__name__) + + +class PythonLiteralOption(click.Option): + + def type_cast_value(self, ctx, value): + try: + return ast.literal_eval(value) + except Exception as e: + logger.error(f"Malformed click input for PythonLiteralOption {e}", exc_info=True) + raise click.BadParameter(value) diff --git a/sonosco/common/utils.py b/sonosco/common/utils.py new file mode 100644 index 0000000..a570af0 --- /dev/null +++ b/sonosco/common/utils.py @@ -0,0 +1,21 @@ +import logging +import os + + +def setup_logging(logger: logging.Logger, filename=None, verbosity=False): + logger.setLevel(logging.DEBUG) + if filename is not None: + log_directory = os.path.dirname(filename) + if not 
os.path.exists(log_directory): + os.makedirs(log_directory) + filename = os.path.join(log_directory, f"{filename}.log") + f_handler = logging.FileHandler(filename=filename, mode="w") + f_handler.setLevel(logging.DEBUG) + f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + f_handler.setFormatter(f_format) + logger.addHandler(f_handler) + c_handler = logging.StreamHandler() + c_handler.setLevel(logging.DEBUG) if verbosity else c_handler.setLevel(logging.INFO) + c_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + c_handler.setFormatter(c_format) + logger.addHandler(c_handler) diff --git a/sonosco/datasets/download_datasets/data_utils.py b/sonosco/datasets/download_datasets/data_utils.py index ae2cc68..e7e15d5 100644 --- a/sonosco/datasets/download_datasets/data_utils.py +++ b/sonosco/datasets/download_datasets/data_utils.py @@ -1,43 +1,44 @@ -from __future__ import print_function - import fnmatch import io import os -from tqdm import tqdm -import subprocess +import logging import torch.distributed as dist +import sonosco.common.audio_tools as audio_tools + +from tqdm import tqdm + +logger = logging.getLogger(__name__) def create_manifest(data_path, output_path, min_duration=None, max_duration=None): + logger.info(f"Creating a manifest for path: {data_path}") file_paths = [os.path.join(dirpath, f) for dirpath, dirnames, files in os.walk(data_path) for f in fnmatch.filter(files, '*.wav')] + logger.info(f"Found {len(file_paths)} .wav files") file_paths = order_and_prune_files(file_paths, min_duration, max_duration) with io.FileIO(output_path, "w") as file: for wav_path in tqdm(file_paths, total=len(file_paths)): transcript_path = wav_path.replace('/wav/', '/txt/').replace('.wav', '.txt') - sample = os.path.abspath(wav_path) + ',' + os.path.abspath(transcript_path) + '\n' + sample = f"{os.path.abspath(wav_path)},{os.path.abspath(transcript_path)}\n" file.write(sample.encode('utf-8')) - print('\n') def order_and_prune_files(file_paths, min_duration, max_duration): - print("Sorting manifests...") - duration_file_paths = [(path, float(subprocess.check_output( - ['soxi -D \"%s\"' % path.strip()], shell=True))) for path in file_paths] + logger.info("Sorting manifests...") + path_and_duration = [(path, audio_tools.get_duration(path)) for path in file_paths] + if min_duration and max_duration: - print("Pruning manifests between %d and %d seconds" % (min_duration, max_duration)) - duration_file_paths = [(path, duration) for path, duration in duration_file_paths if - min_duration <= duration <= max_duration] + logger.info(f"Pruning manifests between {min_duration} and {max_duration} seconds") + path_and_duration = [(path, duration) for path, duration in path_and_duration + if min_duration <= duration <= max_duration] - def func(element): - return element[1] + path_and_duration.sort(key=lambda e: e[1]) + return [x[0] for x in path_and_duration] - duration_file_paths.sort(key=func) - return [x[0] for x in duration_file_paths] # Remove durations def reduce_tensor(tensor, world_size): rt = tensor.clone() dist.all_reduce(rt, op=dist.reduce_op.SUM) rt /= world_size - return rt \ No newline at end of file + return rt diff --git a/sonosco/datasets/download_datasets/librispeech.py b/sonosco/datasets/download_datasets/librispeech.py index 31773cf..7f8295b 100644 --- a/sonosco/datasets/download_datasets/librispeech.py +++ b/sonosco/datasets/download_datasets/librispeech.py @@ -1,25 +1,19 @@ import os +import click import wget import tarfile 
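# A minimal usage sketch for the refactored create_manifest helper above
# (this downloader calls it at the end of each split; the example path is an
# assumption, not a path shipped with the repository). Durations are read via
# `soxi -D` in audio_tools.get_duration, so the sox CLI must be installed:
#
#   import os
#   from sonosco.datasets.download_datasets.data_utils import create_manifest
#
#   split_dir = os.path.join(os.path.expanduser("~"), "temp/data/libri_speech/train")
#   # writes one "<abs .wav path>,<abs .txt path>" line per sample, pruned to 1-15 s clips
#   create_manifest(split_dir, "libri_train_manifest.csv", min_duration=1, max_duration=15)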
-import argparse -import subprocess -from datasets.download_datasets.data_utils import create_manifest -from tqdm import tqdm import shutil +import logging +import sonosco.common.audio_tools as audio_tools + +from sonosco.datasets.download_datasets.data_utils import create_manifest +from sonosco.common.click_extensions import PythonLiteralOption +from sonosco.common.utils import setup_logging +from tqdm import tqdm + + +logger = logging.getLogger("sonosco.datasets.download_datasets.librispeech") -parser = argparse.ArgumentParser(description='Processes and downloads LibriSpeech dataset.') -parser.add_argument("--target-dir", default='LibriSpeech_dataset/', type=str, help="Directory to store the dataset.") -parser.add_argument('--sample-rate', default=16000, type=int, help='Sample rate') -parser.add_argument('--files-to-use', default="train-clean-100.tar.gz," - "train-clean-360.tar.gz,train-other-500.tar.gz," - "dev-clean.tar.gz,dev-other.tar.gz," - "test-clean.tar.gz,test-other.tar.gz", type=str, - help='list of file names to download') -parser.add_argument('--min-duration', default=1, type=int, - help='Prunes training samples shorter than the min duration (given in seconds, default 1)') -parser.add_argument('--max-duration', default=15, type=int, - help='Prunes training samples longer than the max duration (given in seconds, default 15)') -args = parser.parse_args() LIBRI_SPEECH_URLS = { #"train": ["http://www.openslr.org/resources/12/train-clean-100.tar.gz", @@ -38,12 +32,11 @@ def _preprocess_transcript(phrase): return phrase.strip().upper() -def _process_file(wav_dir, txt_dir, base_filename, root_dir): +def _process_file(wav_dir, txt_dir, base_filename, root_dir, sample_rate): full_recording_path = os.path.join(root_dir, base_filename) assert os.path.exists(full_recording_path) and os.path.exists(root_dir) wav_recording_path = os.path.join(wav_dir, base_filename.replace(".flac", ".wav")) - subprocess.call(["sox {} -r {} -b 16 -c 1 {}".format(full_recording_path, str(args.sample_rate), - wav_recording_path)], shell=True) + audio_tools.transcode_recording(full_recording_path, wav_recording_path, sample_rate) # process transcript txt_transcript_path = os.path.join(txt_dir, base_filename.replace(".flac", ".txt")) transcript_file = os.path.join(root_dir, "-".join(base_filename.split('-')[:-1]) + ".trans.txt") @@ -57,20 +50,22 @@ def _process_file(wav_dir, txt_dir, base_filename, root_dir): f.flush() -def main(): - root = os.path.expanduser('~') - data_path = '.temp/data/libri' - - filenames = [ - 'train-clean-100.tar.gz', - 'train-clean-360.tar.gz', - 'train-other-500.tar.gz', - 'dev-clean.tar.gz', - 'dev-other.tar.gz', - 'test-clean.tar.gz', - 'test-other.tar.gz' - ] - path_to_data = os.path.join(root, data_path) +@click.command() +@click.option("--target-dir", default="temp/data/libri_speech", type=str, help="Directory to store the dataset.") +@click.option("--sample-rate", default=16000, type=int, help="Sample rate.") +@click.option("--files-to-use", multiple=True, + default=["train-clean-100.tar.gz", "train-clean-360.tar.gz", "train-other-500.tar.gz", + "dev-clean.tar.gz", "dev-other.tar.gz", "test-clean.tar.gz", "test-other.tar.gz"], + type=str, help="List of file names to download.") +@click.option("--min-duration", default=1, type=int, + help="Prunes training samples shorter than the min duration (given in seconds).") +@click.option("--max-duration", default=15, type=int, + help="Prunes training samples longer than the max duration (given in seconds).") +def main(target_dir, 
sample_rate, files_to_use, min_duration, max_duration): + """Processes and downloads LibriSpeech dataset.""" + setup_logging(logger) + + path_to_data = os.path.join(os.path.expanduser("~"), target_dir) if not os.path.exists(path_to_data): os.makedirs(path_to_data) @@ -90,36 +85,37 @@ def main(): for url in lst_libri_urls: # check if we want to dl this file dl_flag = False - for f in filenames: + for f in files_to_use: if url.find(f) != -1: dl_flag = True if not dl_flag: - print("Skipping url: {}".format(url)) + logger.info(f"Skipping url: {url}") continue filename = url.split("/")[-1] target_filename = os.path.join(split_dir, filename) if not os.path.exists(target_filename): wget.download(url, split_dir) - print("Unpacking {}...".format(filename)) + logger.info("Download complete") + logger.info(f"Unpacking {filename}...") tar = tarfile.open(target_filename) tar.extractall(split_dir) tar.close() os.remove(target_filename) - print("Converting flac files to wav and extracting transcripts...") + logger.info("Converting flac files to wav and extracting transcripts...") assert os.path.exists(extracted_dir), "Archive {} was not properly uncompressed.".format(filename) for root, subdirs, files in tqdm(os.walk(extracted_dir)): for f in files: if f.find(".flac") != -1: _process_file(wav_dir=split_wav_dir, txt_dir=split_txt_dir, - base_filename=f, root_dir=root) + base_filename=f, root_dir=root, sample_rate=sample_rate) - print("Finished {}".format(url)) + logger.info(f"Finished {url}") shutil.rmtree(extracted_dir) if split_type == 'train': # Prune to min/max duration - create_manifest(split_dir, 'libri_' + split_type + '_manifest.csv', args.min_duration, args.max_duration) + create_manifest(split_dir, 'libri_' + split_type + '_manifest.csv', min_duration, max_duration) else: create_manifest(split_dir, 'libri_' + split_type + '_manifest.csv') if __name__ == "__main__": - main() \ No newline at end of file + main() From e5ff4c3024c61985c431ec9265c014d856b702d2 Mon Sep 17 00:00:00 2001 From: "w.jurasz" Date: Wed, 5 Jun 2019 01:40:55 +0200 Subject: [PATCH 19/58] Added conda local install, partialy adjusted model to pycandle --- install_dependencies.sh | 22 ++++++++++++++-------- post_requirements.txt | 1 - sonosco/datasets/AudioDataLoader.py | 5 +++-- sonosco/models/deepspeech2.py | 12 ++++++++---- sonosco/pycandle_train.py | 2 +- 5 files changed, 26 insertions(+), 16 deletions(-) diff --git a/install_dependencies.sh b/install_dependencies.sh index 4cdbedd..d4ad0a0 100755 --- a/install_dependencies.sh +++ b/install_dependencies.sh @@ -12,12 +12,12 @@ case ${i} in CUDA="${i#*=}" shift # past argument=value ;; - -e=*|--venv=*) - VENV="${i#*=}" + -a=*|--anaconda=*) + ANACONDA="${i#*=}" shift # past argument=value ;; - -p=*|--python_path=*) - VENV_PATH="${i#*=}" + -e=*|--venv=*) + VENV="${i#*=}" shift # past argument=value ;; *) @@ -26,11 +26,12 @@ case ${i} in esac done -VENV=${VENV:-true} +VENV=${VENV:-./venv} -if [ "$VENV" = true ] ; then - VENV_PATH=${VENV_PATH:-./venv} - source ${VENV_PATH}/bin/activate +if [ -z ${ANACONDA+x} ] ; then + conda activate ${ANACONDA} +elif [ -z ${VENV+x} ] ; then + source ${VENV}/bin/activate fi #TODO: Infer this automatically @@ -50,4 +51,9 @@ cd ../pytorch_binding && python setup.py install cd ../.. rm -rf warp-ctc +git clone git@github.com:pytorch/audio.git +cd audio; MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py install +cd .. 
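# Usage sketch for the flags parsed at the top of this script (values are
# illustrative; the case statement accepts -c=/--cuda=, -a=/--anaconda= and
# -e=/--venv=). Note, as an observation: `[ -z ${ANACONDA+x} ]` is true when
# ANACONDA is *unset*, so the conda/venv activation branches appear inverted;
# a `-n` test would match the stated intent.
#   ./install_dependencies.sh -c=true          # build warp-ctc with CUDA support
#   ./install_dependencies.sh -a=sonosco-env   # install into a conda environment
#   ./install_dependencies.sh -e=./other-venv  # install into a non-default virtualenv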
+rm -rf audio + pip install -r post_requirements.txt \ No newline at end of file diff --git a/post_requirements.txt b/post_requirements.txt index f4c027f..f750f4b 100644 --- a/post_requirements.txt +++ b/post_requirements.txt @@ -1,2 +1 @@ --e git://github.com/pytorch/audio.git#egg=torchaudio-0.2 -e git://github.com/NVIDIA/apex.git#egg=apex diff --git a/sonosco/datasets/AudioDataLoader.py b/sonosco/datasets/AudioDataLoader.py index 8786c2d..577d1ed 100644 --- a/sonosco/datasets/AudioDataLoader.py +++ b/sonosco/datasets/AudioDataLoader.py @@ -29,6 +29,7 @@ def _collate_fn(self, batch): inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) target_sizes[x] = len(batch[x][1]) - targets.extend(batch[x][1]) + targets.append([batch[x][1]]) - return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) \ No newline at end of file + # return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) + return (inputs, input_percentages), torch.IntTensor(targets) \ No newline at end of file diff --git a/sonosco/models/deepspeech2.py b/sonosco/models/deepspeech2.py index dcfa100..5d2dff8 100644 --- a/sonosco/models/deepspeech2.py +++ b/sonosco/models/deepspeech2.py @@ -82,13 +82,14 @@ def forward(self, input_): class BatchRNN(nn.Module): - def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, batch_norm=True): + def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, batch_norm=True, bidirectional=True): super(BatchRNN, self).__init__() + self.bidirectional = bidirectional self.input_size = input_size self.hidden_size = hidden_size self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size, - bidirectional=True, bias=True) + bidirectional=self.bidirectional, bias=True) def flatten_parameters(self): self.rnn.flatten_parameters() @@ -155,9 +156,12 @@ def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hid_size=768, nb_layers=5 self.inference_softmax = InferenceBatchSoftmax() - def forward(self, x, lengths): + def forward(self, xx): # if x.is_cuda and self.mixed_precision: # x = x.half() + x, input_percentages = xx + + lengths = input_percentages.mul_(int(x.size(3))).int() lengths = lengths.cpu().int() output_lengths = self.get_seq_lens(lengths) x, _ = self.conv(x, output_lengths) @@ -176,7 +180,7 @@ def forward(self, x, lengths): x = x.transpose(0, 1) # identity in training mode, softmax in eval mode x = self.inference_softmax(x) - return x, output_lengths + return x def get_seq_lens(self, input_length): """ diff --git a/sonosco/pycandle_train.py b/sonosco/pycandle_train.py index 7310789..9645dfa 100644 --- a/sonosco/pycandle_train.py +++ b/sonosco/pycandle_train.py @@ -29,5 +29,5 @@ def load_datasets(manifest_path, batch_size_train, batch_size_test): train_loader = load_datasets("./datasets/download_datasets/libri_test_clean_manifest.csv", batch_size_train=64, batch_size_test=64) optimizer = torch.optim.Adam(model.parameters(), lr=0.01) -model_trainer = ModelTrainer(model, optimizer, F.nll_loss, 20, train_loader, gpu=0) +model_trainer = ModelTrainer(model, optimizer, F.nll_loss, 20, train_loader, gpu=None) model_trainer.start_training() From f0fc4a92e75c7532c60462fc9b9a45e68ec45d7e Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Wed, 5 Jun 2019 02:17:01 +0200 Subject: [PATCH 20/58] finish refactoring librispeech download --- sonosco/common/path_utils.py | 12 
+++++++ .../datasets/download_datasets/data_utils.py | 1 + .../datasets/download_datasets/librispeech.py | 36 ++++++++++--------- 3 files changed, 33 insertions(+), 16 deletions(-) create mode 100644 sonosco/common/path_utils.py diff --git a/sonosco/common/path_utils.py b/sonosco/common/path_utils.py new file mode 100644 index 0000000..bf8c8ef --- /dev/null +++ b/sonosco/common/path_utils.py @@ -0,0 +1,12 @@ +import os +import wget + + +def try_create_directory(path: str): + if not os.path.exists(path): + os.makedirs(path) + + +def try_download(destination: str, url: str): + if not os.path.exists(destination): + wget.download(url, destination) diff --git a/sonosco/datasets/download_datasets/data_utils.py b/sonosco/datasets/download_datasets/data_utils.py index e7e15d5..e6a5cd8 100644 --- a/sonosco/datasets/download_datasets/data_utils.py +++ b/sonosco/datasets/download_datasets/data_utils.py @@ -7,6 +7,7 @@ from tqdm import tqdm +import pdb; pdb.set_trace() logger = logging.getLogger(__name__) diff --git a/sonosco/datasets/download_datasets/librispeech.py b/sonosco/datasets/download_datasets/librispeech.py index 7f8295b..38b3f61 100644 --- a/sonosco/datasets/download_datasets/librispeech.py +++ b/sonosco/datasets/download_datasets/librispeech.py @@ -1,18 +1,17 @@ import os import click -import wget import tarfile import shutil import logging import sonosco.common.audio_tools as audio_tools +import sonosco.common.path_utils as path_utils from sonosco.datasets.download_datasets.data_utils import create_manifest -from sonosco.common.click_extensions import PythonLiteralOption from sonosco.common.utils import setup_logging from tqdm import tqdm -logger = logging.getLogger("sonosco.datasets.download_datasets.librispeech") +logger = logging.getLogger("sonosco") LIBRI_SPEECH_URLS = { @@ -40,12 +39,12 @@ def _process_file(wav_dir, txt_dir, base_filename, root_dir, sample_rate): # process transcript txt_transcript_path = os.path.join(txt_dir, base_filename.replace(".flac", ".txt")) transcript_file = os.path.join(root_dir, "-".join(base_filename.split('-')[:-1]) + ".trans.txt") - assert os.path.exists(transcript_file), "Transcript file {} does not exist.".format(transcript_file) + assert os.path.exists(transcript_file), f"Transcript file {transcript_file} does not exist" transcriptions = open(transcript_file).read().strip().split("\n") transcriptions = {t.split()[0].split("-")[-1]: " ".join(t.split()[1:]) for t in transcriptions} with open(txt_transcript_path, "w") as f: key = base_filename.replace(".flac", "").split("-")[-1] - assert key in transcriptions, "{} is not in the transcriptions".format(key) + assert key in transcriptions, f"{key} is not in the transcriptions" f.write(_preprocess_transcript(transcriptions[key])) f.flush() @@ -71,17 +70,16 @@ def main(target_dir, sample_rate, files_to_use, min_duration, max_duration): for split_type, lst_libri_urls in LIBRI_SPEECH_URLS.items(): split_dir = os.path.join(path_to_data, split_type) - if not os.path.exists(split_dir): - os.makedirs(split_dir) + path_utils.try_create_directory(split_dir) split_wav_dir = os.path.join(split_dir, "wav") - if not os.path.exists(split_wav_dir): - os.makedirs(split_wav_dir) + path_utils.try_create_directory(split_wav_dir) split_txt_dir = os.path.join(split_dir, "txt") - if not os.path.exists(split_txt_dir): - os.makedirs(split_txt_dir) + path_utils.try_create_directory(split_txt_dir) extracted_dir = os.path.join(split_dir, "LibriSpeech") + if os.path.exists(extracted_dir): shutil.rmtree(extracted_dir) + for url in 
lst_libri_urls: # check if we want to dl this file dl_flag = False @@ -91,18 +89,19 @@ def main(target_dir, sample_rate, files_to_use, min_duration, max_duration): if not dl_flag: logger.info(f"Skipping url: {url}") continue + filename = url.split("/")[-1] target_filename = os.path.join(split_dir, filename) - if not os.path.exists(target_filename): - wget.download(url, split_dir) + path_utils.try_download(target_filename, url) logger.info("Download complete") logger.info(f"Unpacking {filename}...") tar = tarfile.open(target_filename) tar.extractall(split_dir) tar.close() os.remove(target_filename) + assert os.path.exists(extracted_dir), f"Archive {filename} was not properly uncompressed" + logger.info("Converting flac files to wav and extracting transcripts...") - assert os.path.exists(extracted_dir), "Archive {} was not properly uncompressed.".format(filename) for root, subdirs, files in tqdm(os.walk(extracted_dir)): for f in files: if f.find(".flac") != -1: @@ -111,10 +110,15 @@ def main(target_dir, sample_rate, files_to_use, min_duration, max_duration): logger.info(f"Finished {url}") shutil.rmtree(extracted_dir) + + manifest_path = os.path.join(path_to_data, f"libri_{split_type}_manifest.csv") + if os.path.exists(manifest_path): + continue + if split_type == 'train': # Prune to min/max duration - create_manifest(split_dir, 'libri_' + split_type + '_manifest.csv', min_duration, max_duration) + create_manifest(split_dir, manifest_path, min_duration, max_duration) else: - create_manifest(split_dir, 'libri_' + split_type + '_manifest.csv') + create_manifest(split_dir, manifest_path) if __name__ == "__main__": From d849589b65b162122d1ebe1fc1a2fd0f66b78884 Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Wed, 5 Jun 2019 02:29:59 +0200 Subject: [PATCH 21/58] move manifest to data dir --- sonosco/datasets/AudioDataLoader.py | 9 ++++----- sonosco/datasets/AudioDataset.py | 7 ++++++- sonosco/datasets/datasets_test_script.py | 6 +++++- sonosco/pycandle_train.py | 7 +++++-- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/sonosco/datasets/AudioDataLoader.py b/sonosco/datasets/AudioDataLoader.py index 8786c2d..f8f7337 100644 --- a/sonosco/datasets/AudioDataLoader.py +++ b/sonosco/datasets/AudioDataLoader.py @@ -1,14 +1,13 @@ - import numpy as np import torch from torch.utils.data import Dataset, DataLoader, Sampler + class AudioDataLoader(DataLoader): + def __init__(self, *args, **kwargs): - """ - Creates a data loader for AudioDatasets. 
- """ + """Creates a data loader for AudioDatasets.""" super(AudioDataLoader, self).__init__(*args, **kwargs) self.collate_fn = self._collate_fn @@ -31,4 +30,4 @@ def _collate_fn(self, batch): target_sizes[x] = len(batch[x][1]) targets.extend(batch[x][1]) - return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) \ No newline at end of file + return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) diff --git a/sonosco/datasets/AudioDataset.py b/sonosco/datasets/AudioDataset.py index 0a95284..6aea72a 100644 --- a/sonosco/datasets/AudioDataset.py +++ b/sonosco/datasets/AudioDataset.py @@ -4,6 +4,7 @@ # ---------------------------------------------------------------------------- import warnings +import os from typing import Tuple import torch @@ -118,12 +119,14 @@ def __getitem__(self, index): def __len__(self): return self.size + def main(): audio_conf = dict(sample_rate=16000, window_size=.02, window_stride=.01, window='hamming') - test_manifest = '/Users/florianlay/data/libri_test_clean_manifest.csv' + manifest_directory = os.path.join(os.path.expanduser("~"), "temp/data/libri_speech") + test_manifest = os.path.join(manifest_directory, "libri_test_clean_manifest.csv") labels = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' test_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=test_manifest, labels=labels, normalize=False, augment=False) @@ -135,5 +138,7 @@ def main(): dataloader = DataLoader(dataset=test_dataset, num_workers=4, collate_fn=_collate_fn, batch_sampler=sampler) test_dataset[0] #inputs, targets, input_percentages, target_sizes = next(iter(dataloader)) + + if __name__ == "__main__": main() \ No newline at end of file diff --git a/sonosco/datasets/datasets_test_script.py b/sonosco/datasets/datasets_test_script.py index 7f8a11f..f4daf79 100644 --- a/sonosco/datasets/datasets_test_script.py +++ b/sonosco/datasets/datasets_test_script.py @@ -1,3 +1,5 @@ +import os + from AudioDataLoader import AudioDataLoader from AudioDataSampler import BucketingSampler, DistributedBucketingSampler from AudioDataset import AudioDataset @@ -12,7 +14,9 @@ def main(): window_size=.02, window_stride=.01, window='hamming') - test_manifest = '/Users/florianlay/data/libri_test_clean_manifest.csv' + + manifest_directory = os.path.join(os.path.expanduser("~"), "temp/data/libri_speech") + test_manifest = os.path.join(manifest_directory, "libri_test_clean_manifest.csv") labels = 'abc' test_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=test_manifest, labels=labels, normalize=False, augment=False) diff --git a/sonosco/pycandle_train.py b/sonosco/pycandle_train.py index 7310789..fc1fbbe 100644 --- a/sonosco/pycandle_train.py +++ b/sonosco/pycandle_train.py @@ -1,4 +1,5 @@ import torch +import os import torch.nn.functional as F #import datasets.download_datasets.librispeech as librispeech @@ -23,11 +24,13 @@ def load_datasets(manifest_path, batch_size_train, batch_size_test): sampler = BucketingSampler(test_dataset, batch_size=batch_size) return AudioDataLoader(test_dataset, num_workers=4, batch_sampler=sampler) + +manifest_directory = os.path.join(os.path.expanduser("~"), "temp/data/libri_speech") +test_manifest = os.path.join(manifest_directory, "libri_test_clean_manifest.csv") # librispeech.main() model = DeepSpeech2().cpu() experiment = Experiment('mnist_example') -train_loader = load_datasets("./datasets/download_datasets/libri_test_clean_manifest.csv", - batch_size_train=64, batch_size_test=64) +train_loader = 
load_datasets(test_manifest, batch_size_train=64, batch_size_test=64) optimizer = torch.optim.Adam(model.parameters(), lr=0.01) model_trainer = ModelTrainer(model, optimizer, F.nll_loss, 20, train_loader, gpu=0) model_trainer.start_training() From 98f0943da99670a803cf1a6aa6b429082c7d845b Mon Sep 17 00:00:00 2001 From: ga38nif Date: Thu, 6 Jun 2019 00:48:14 +0200 Subject: [PATCH 22/58] start refactoring common_voice --- .../download_datasets/common_voice.py | 83 ++++++++++--------- 1 file changed, 46 insertions(+), 37 deletions(-) diff --git a/sonosco/datasets/download_datasets/common_voice.py b/sonosco/datasets/download_datasets/common_voice.py index fbc7b91..70451ab 100644 --- a/sonosco/datasets/download_datasets/common_voice.py +++ b/sonosco/datasets/download_datasets/common_voice.py @@ -1,27 +1,32 @@ import os import wget +import click +import logging import tarfile -import argparse +import shutil import csv +import sonosco.common.audio_tools as audio_tools +import sonosco.common.path_utils as path_utils from multiprocessing.pool import ThreadPool import subprocess -from utils import create_manifest - -parser = argparse.ArgumentParser(description='Downloads and processes Mozilla Common Voice dataset.') -parser.add_argument("--target-dir", default='CommonVoice_dataset/', type=str, help="Directory to store the dataset.") -parser.add_argument("--tar-path", type=str, help="Path to the Common Voice *.tar file if downloaded (Optional).") -parser.add_argument('--sample-rate', default=16000, type=int, help='Sample rate') -parser.add_argument('--min-duration', default=1, type=int, - help='Prunes training samples shorter than the min duration (given in seconds, default 1)') -parser.add_argument('--max-duration', default=15, type=int, - help='Prunes training samples longer than the max duration (given in seconds, default 15)') -parser.add_argument('--files-to-process', default="cv-valid-dev.csv,cv-valid-test.csv,cv-valid-train.csv", - type=str, help='list of *.csv file names to process') -args = parser.parse_args() +from sonosco.datasets.download_datasets.data_utils import create_manifest +from sonosco.common.utils import setup_logging + +logger = logging.getLogger("sonosco") + COMMON_VOICE_URL = "https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz" +@click.command() +@click.option("--target-dir", default="temp/data/libri_speech", type=str, help="Directory to store the dataset.") +@click.option("--sample-rate", default=16000, type=int, help="Sample rate.") +@click.option("--files-to-use", multiple=True, + default=["cv-valid-dev.csv","cv-valid-test.csv","cv-valid-train.csv"]) +@click.option("--min-duration", default=1, type=int, + help="Prunes training samples shorter than the min duration (given in seconds).") +@click.option("--max-duration", default=15, type=int, + help="Prunes training samples longer than the max duration (given in seconds).") -def convert_to_wav(csv_file, target_dir): +def convert_to_wav(csv_file, target_dir, sample_rate): """ Read *.csv file description, convert mp3 to wav, process text. Save results to target_dir. 
Args: @@ -30,8 +35,8 @@ def convert_to_wav(csv_file, target_dir): """ wav_dir = os.path.join(target_dir, 'wav/') txt_dir = os.path.join(target_dir, 'txt/') - os.makedirs(wav_dir, exist_ok=True) - os.makedirs(txt_dir, exist_ok=True) + path_utils.try_create_directory(wav_dir) + path_utils.try_create_directory(txt_dir) path_to_data = os.path.dirname(csv_file) def process(x): @@ -42,7 +47,7 @@ def process(x): f.write(text) cmd = "sox {} -r {} -b 16 -c 1 {}".format( os.path.join(path_to_data, file_path), - args.sample_rate, + sample_rate, os.path.join(wav_dir, file_name + '.wav')) subprocess.call([cmd], shell=True) @@ -54,36 +59,40 @@ def process(x): pool.map(process, data) -def main(): - target_dir = args.target_dir - os.makedirs(target_dir, exist_ok=True) +def main(target_dir, sample_rate, files_to_use, min_duration, max_duration): + setup_logging(logger) + + path_to_data = os.path.join(os.path.expanduser("~"), target_dir) + path_utils.try_create_directory(path_to_data) - target_unpacked_dir = os.path.join(target_dir, "CV_unpacked") - os.makedirs(target_unpacked_dir, exist_ok=True) + target_unpacked_dir = os.path.join(target_dir, "common_unpacked") + path_utils.try_create_directory(target_unpacked_dir) - if args.tar_path and os.path.exists(args.tar_path): - print('Find existing file {}'.format(args.tar_path)) - target_file = args.tar_path - else: - print("Could not find downloaded Common Voice archive, Downloading corpus...") - filename = wget.download(COMMON_VOICE_URL, target_dir) - target_file = os.path.join(target_dir, os.path.basename(filename)) + extracted_dir = os.path.join(path_to_data, "CommonVoice") + if os.path.exists(extracted_dir): + shutil.rmtree(extracted_dir) + + path_utils.try_download(target_unpacked_dir, COMMON_VOICE_URL) + + logger.info("Download complete") + logger.info("Unpacking...") print("Unpacking corpus to {} ...".format(target_unpacked_dir)) - tar = tarfile.open(target_file) - tar.extractall(target_unpacked_dir) + tar = tarfile.open(target_unpacked_dir) + tar.extractall(extracted_dir) tar.close() for csv_file in args.files_to_process.split(','): - convert_to_wav(os.path.join(target_unpacked_dir, 'cv_corpus_v1/', csv_file), - os.path.join(target_dir, os.path.splitext(csv_file)[0])) + convert_to_wav(os.path.join(extracted_dir, 'cv_corpus_v1/', csv_file), + os.path.join(target_dir, os.path.splitext(csv_file)[0]), + sample_rate) print('Creating manifests...') - for csv_file in args.files_to_process.split(','): + for csv_file in files_to_use.split(','): create_manifest(os.path.join(target_dir, os.path.splitext(csv_file)[0]), os.path.splitext(csv_file)[0] + '_manifest.csv', - args.min_duration, - args.max_duration) + min_duration, + max_duration) if __name__ == "__main__": From 2445d7133b6e7ade9d74e0edb231decc8349aa36 Mon Sep 17 00:00:00 2001 From: "w.jurasz" Date: Thu, 6 Jun 2019 09:40:07 +0200 Subject: [PATCH 23/58] Removed pycandle, improved local install --- install_dependencies.sh | 14 ++++++++--- sonosco/datasets/AudioDataLoader.py | 5 ++-- sonosco/models/deepspeech2.py | 12 ++++------ sonosco/pycandle_train.py | 36 ----------------------------- 4 files changed, 17 insertions(+), 50 deletions(-) delete mode 100644 sonosco/pycandle_train.py diff --git a/install_dependencies.sh b/install_dependencies.sh index d4ad0a0..2be3b54 100755 --- a/install_dependencies.sh +++ b/install_dependencies.sh @@ -1,6 +1,10 @@ #!/usr/bin/env bash -#This scripts assumes that you have a virtual env in ./venv, you can override this by ./install_dependencies.sh -p /some/other/path + +# 
Running without arguments -> installing into virtual env located in ./venv +# -a= takes precedence before the virtual env and installs to conda env +# -e=/path/to/venv installs in different venv then ./venv +# -c=true installs with cuda support (default false) set -e @@ -47,7 +51,7 @@ else export CUDA_HOME="/usr/local/cuda" fi cd warp-ctc; mkdir build; cd build; cmake ..; make -cd ../pytorch_binding && python setup.py install +cd ../pytorch_binding && MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py install cd ../.. rm -rf warp-ctc @@ -56,4 +60,8 @@ cd audio; MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py ins cd .. rm -rf audio -pip install -r post_requirements.txt \ No newline at end of file +pip install -r post_requirements.txt + +if [ -f ./src/pip-delete-this-directory.txt ]; then + rm -rf ./src/ +fi \ No newline at end of file diff --git a/sonosco/datasets/AudioDataLoader.py b/sonosco/datasets/AudioDataLoader.py index d1dbccd..967b89c 100644 --- a/sonosco/datasets/AudioDataLoader.py +++ b/sonosco/datasets/AudioDataLoader.py @@ -28,7 +28,6 @@ def _collate_fn(self, batch): inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) target_sizes[x] = len(batch[x][1]) - targets.append([batch[x][1]]) + targets.extend(batch[x][1]) - # return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) - return (inputs, input_percentages), torch.IntTensor(targets) \ No newline at end of file + return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) \ No newline at end of file diff --git a/sonosco/models/deepspeech2.py b/sonosco/models/deepspeech2.py index 5d2dff8..dcfa100 100644 --- a/sonosco/models/deepspeech2.py +++ b/sonosco/models/deepspeech2.py @@ -82,14 +82,13 @@ def forward(self, input_): class BatchRNN(nn.Module): - def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, batch_norm=True, bidirectional=True): + def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, batch_norm=True): super(BatchRNN, self).__init__() - self.bidirectional = bidirectional self.input_size = input_size self.hidden_size = hidden_size self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size, - bidirectional=self.bidirectional, bias=True) + bidirectional=True, bias=True) def flatten_parameters(self): self.rnn.flatten_parameters() @@ -156,12 +155,9 @@ def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hid_size=768, nb_layers=5 self.inference_softmax = InferenceBatchSoftmax() - def forward(self, xx): + def forward(self, x, lengths): # if x.is_cuda and self.mixed_precision: # x = x.half() - x, input_percentages = xx - - lengths = input_percentages.mul_(int(x.size(3))).int() lengths = lengths.cpu().int() output_lengths = self.get_seq_lens(lengths) x, _ = self.conv(x, output_lengths) @@ -180,7 +176,7 @@ def forward(self, xx): x = x.transpose(0, 1) # identity in training mode, softmax in eval mode x = self.inference_softmax(x) - return x + return x, output_lengths def get_seq_lens(self, input_length): """ diff --git a/sonosco/pycandle_train.py b/sonosco/pycandle_train.py deleted file mode 100644 index 71ec2ad..0000000 --- a/sonosco/pycandle_train.py +++ /dev/null @@ -1,36 +0,0 @@ -import torch -import os -import torch.nn.functional as F -#import datasets.download_datasets.librispeech as librispeech - -from datasets.AudioDataLoader import 
AudioDataLoader -from datasets.AudioDataSampler import BucketingSampler -from datasets.AudioDataset import AudioDataset -from models.deepspeech2 import DeepSpeech2 -from pycandle.general.experiment import Experiment -from pycandle.training.model_trainer import ModelTrainer - -def load_datasets(manifest_path, batch_size_train, batch_size_test): - audio_conf = dict(sample_rate=16000, - window_size=.02, - window_stride=.01, - window='hamming') - labels = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' - test_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=manifest_path, labels=labels, - normalize=False, augment=False) - print("Dataset is created\n====================\n") - - batch_size = 16 - sampler = BucketingSampler(test_dataset, batch_size=batch_size) - return AudioDataLoader(test_dataset, num_workers=4, batch_sampler=sampler) - - -manifest_directory = os.path.join(os.path.expanduser("~"), "temp/data/libri_speech") -test_manifest = os.path.join(manifest_directory, "libri_test_clean_manifest.csv") -# librispeech.main() -model = DeepSpeech2().cpu() -experiment = Experiment('mnist_example') -train_loader = load_datasets(test_manifest, batch_size_train=64, batch_size_test=64) -optimizer = torch.optim.Adam(model.parameters(), lr=0.01) -model_trainer = ModelTrainer(model, optimizer, F.nll_loss, 20, train_loader, gpu=None) -model_trainer.start_training() From 7752aaf991f65dfb5a65343afc8b97cad7206aff Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Thu, 6 Jun 2019 11:29:17 +0200 Subject: [PATCH 24/58] refactoring --- sonosco/common/constants.py | 1 + sonosco/datasets/AudioDataset.py | 2 - .../datasets/download_datasets/librispeech.py | 80 ++++++++++--------- 3 files changed, 43 insertions(+), 40 deletions(-) create mode 100644 sonosco/common/constants.py diff --git a/sonosco/common/constants.py b/sonosco/common/constants.py new file mode 100644 index 0000000..3d06bc6 --- /dev/null +++ b/sonosco/common/constants.py @@ -0,0 +1 @@ +SONOSCO = "sonosco" \ No newline at end of file diff --git a/sonosco/datasets/AudioDataset.py b/sonosco/datasets/AudioDataset.py index 6aea72a..0c271f7 100644 --- a/sonosco/datasets/AudioDataset.py +++ b/sonosco/datasets/AudioDataset.py @@ -66,8 +66,6 @@ def parse_audio(self, audio_path): win_length=int(self.sample_rate * self.window_size), window=torch.hamming_window(int(self.sample_rate * self.window_size)))[:, :, -1] - - if self.normalize: mean = spectrogram.mean() std = spectrogram.std() diff --git a/sonosco/datasets/download_datasets/librispeech.py b/sonosco/datasets/download_datasets/librispeech.py index 38b3f61..9c0e6e4 100644 --- a/sonosco/datasets/download_datasets/librispeech.py +++ b/sonosco/datasets/download_datasets/librispeech.py @@ -8,10 +8,11 @@ from sonosco.datasets.download_datasets.data_utils import create_manifest from sonosco.common.utils import setup_logging +from sonosco.common.constants import * from tqdm import tqdm -logger = logging.getLogger("sonosco") +logger = logging.getLogger(SONOSCO) LIBRI_SPEECH_URLS = { @@ -27,43 +28,7 @@ } -def _preprocess_transcript(phrase): - return phrase.strip().upper() - - -def _process_file(wav_dir, txt_dir, base_filename, root_dir, sample_rate): - full_recording_path = os.path.join(root_dir, base_filename) - assert os.path.exists(full_recording_path) and os.path.exists(root_dir) - wav_recording_path = os.path.join(wav_dir, base_filename.replace(".flac", ".wav")) - audio_tools.transcode_recording(full_recording_path, wav_recording_path, sample_rate) - # process transcript - txt_transcript_path = 
os.path.join(txt_dir, base_filename.replace(".flac", ".txt")) - transcript_file = os.path.join(root_dir, "-".join(base_filename.split('-')[:-1]) + ".trans.txt") - assert os.path.exists(transcript_file), f"Transcript file {transcript_file} does not exist" - transcriptions = open(transcript_file).read().strip().split("\n") - transcriptions = {t.split()[0].split("-")[-1]: " ".join(t.split()[1:]) for t in transcriptions} - with open(txt_transcript_path, "w") as f: - key = base_filename.replace(".flac", "").split("-")[-1] - assert key in transcriptions, f"{key} is not in the transcriptions" - f.write(_preprocess_transcript(transcriptions[key])) - f.flush() - - -@click.command() -@click.option("--target-dir", default="temp/data/libri_speech", type=str, help="Directory to store the dataset.") -@click.option("--sample-rate", default=16000, type=int, help="Sample rate.") -@click.option("--files-to-use", multiple=True, - default=["train-clean-100.tar.gz", "train-clean-360.tar.gz", "train-other-500.tar.gz", - "dev-clean.tar.gz", "dev-other.tar.gz", "test-clean.tar.gz", "test-other.tar.gz"], - type=str, help="List of file names to download.") -@click.option("--min-duration", default=1, type=int, - help="Prunes training samples shorter than the min duration (given in seconds).") -@click.option("--max-duration", default=15, type=int, - help="Prunes training samples longer than the max duration (given in seconds).") -def main(target_dir, sample_rate, files_to_use, min_duration, max_duration): - """Processes and downloads LibriSpeech dataset.""" - setup_logging(logger) - +def try_download_librispeech(target_dir, sample_rate, files_to_use, min_duration, max_duration): path_to_data = os.path.join(os.path.expanduser("~"), target_dir) if not os.path.exists(path_to_data): os.makedirs(path_to_data) @@ -121,5 +86,44 @@ def main(target_dir, sample_rate, files_to_use, min_duration, max_duration): create_manifest(split_dir, manifest_path) +def _preprocess_transcript(phrase): + return phrase.strip().upper() + + +def _process_file(wav_dir, txt_dir, base_filename, root_dir, sample_rate): + full_recording_path = os.path.join(root_dir, base_filename) + assert os.path.exists(full_recording_path) and os.path.exists(root_dir) + wav_recording_path = os.path.join(wav_dir, base_filename.replace(".flac", ".wav")) + audio_tools.transcode_recording(full_recording_path, wav_recording_path, sample_rate) + # process transcript + txt_transcript_path = os.path.join(txt_dir, base_filename.replace(".flac", ".txt")) + transcript_file = os.path.join(root_dir, "-".join(base_filename.split('-')[:-1]) + ".trans.txt") + assert os.path.exists(transcript_file), f"Transcript file {transcript_file} does not exist" + transcriptions = open(transcript_file).read().strip().split("\n") + transcriptions = {t.split()[0].split("-")[-1]: " ".join(t.split()[1:]) for t in transcriptions} + with open(txt_transcript_path, "w") as f: + key = base_filename.replace(".flac", "").split("-")[-1] + assert key in transcriptions, f"{key} is not in the transcriptions" + f.write(_preprocess_transcript(transcriptions[key])) + f.flush() + + +@click.command() +@click.option("--target-dir", default="temp/data/libri_speech", type=str, help="Directory to store the dataset.") +@click.option("--sample-rate", default=16000, type=int, help="Sample rate.") +@click.option("--files-to-use", multiple=True, + default=["train-clean-100.tar.gz", "train-clean-360.tar.gz", "train-other-500.tar.gz", + "dev-clean.tar.gz", "dev-other.tar.gz", "test-clean.tar.gz", "test-other.tar.gz"], + 
type=str, help="List of file names to download.") +@click.option("--min-duration", default=1, type=int, + help="Prunes training samples shorter than the min duration (given in seconds).") +@click.option("--max-duration", default=15, type=int, + help="Prunes training samples longer than the max duration (given in seconds).") +def main(**kwargs): + """Processes and downloads LibriSpeech dataset.""" + setup_logging(logger) + try_download_librispeech(**kwargs) + + if __name__ == "__main__": main() From 3968b613b92a069412b00e1b3f51fd653d51cc11 Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Thu, 6 Jun 2019 12:46:55 +0200 Subject: [PATCH 25/58] start refactoring audio dataset --- sonosco/config/__init__.py | 0 sonosco/config/global_settings.py | 1 + sonosco/datasets/AudioDataset.py | 107 ++++++++++-------- .../datasets/download_datasets/data_utils.py | 1 - .../datasets/download_datasets/librispeech.py | 4 +- 5 files changed, 63 insertions(+), 50 deletions(-) create mode 100644 sonosco/config/__init__.py create mode 100644 sonosco/config/global_settings.py diff --git a/sonosco/config/__init__.py b/sonosco/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sonosco/config/global_settings.py b/sonosco/config/global_settings.py new file mode 100644 index 0000000..2f2522a --- /dev/null +++ b/sonosco/config/global_settings.py @@ -0,0 +1 @@ +CUDA_ENABLED = False \ No newline at end of file diff --git a/sonosco/datasets/AudioDataset.py b/sonosco/datasets/AudioDataset.py index 0c271f7..098d195 100644 --- a/sonosco/datasets/AudioDataset.py +++ b/sonosco/datasets/AudioDataset.py @@ -5,39 +5,50 @@ import warnings import os -from typing import Tuple - +import logging import torch import torchaudio -from scipy import signal +import sonosco.config.global_settings as global_settings + +from typing import Tuple from torch.utils.data import Dataset +from sonosco.common.utils import setup_logging +from sonosco.common.constants import * -windows = {"bartlett": torch.bartlett_window, - "blackman": torch.blackman_window, - "hamming": torch.hamming_window, - "hann": torch.hann_window} +logger = logging.getLogger(__name__) -class DataProcessor(object): - def __init__(self, audio_conf, labels="abc", normalize=False, augment=False): + +class DataProcessor: + + def __init__(self, window_stride, window_size, sample_rate, labels="abc", normalize=False, augment=False): """ Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by a comma. Each new line is a different sample. Example below: /path/to/audio.wav,/path/to/audio.txt ... 
- :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds - :param labels: String containing all the possible characters to map to - :param normalize: Apply standard mean and deviation normalization to audio tensor - :param augment(default False): Apply random tempo and gain perturbations + :param window_stride: number of seconds to skip between each window + :param window_size: number of seconds to use for a window of spectrogram + :param sample_rate: sample rate of the recordings + :param labels: string containing all the possible characters to map to + :param normalize: apply standard mean and deviation normalization to audio tensor + :param augment(default False): apply random tempo and gain perturbations """ + self.window_stride = window_stride + self.window_size = window_size + self.sample_rate = sample_rate self.labels_map = dict([(labels[i], i) for i in range(len(labels))]) - self.window_stride = audio_conf["window_stride"] - self.window_size = audio_conf["window_size"] - self.sample_rate = audio_conf["sample_rate"] - self.window = windows.get(audio_conf["window"], windows["hamming"]) self.normalize = normalize self.augment = augment + @property + def window_stride_samples(self): + return int(self.sample_rate * self.window_stride) + + @property + def window_size_samples(self): + return int(self.sample_rate * self.window_stride) + @staticmethod def retrieve_file(audio_path): sound, sample_rate = torchaudio.load(audio_path) @@ -45,57 +56,52 @@ def retrieve_file(audio_path): @staticmethod def augment_audio(sound, tempo_range: Tuple = (0.85, 1.15), gain_range: Tuple = (-6, 8)): - """ - Changes tempo and gain of the wave - """ + """Changes tempo and gain of the wave.""" warnings.warn("Augmentation is not implemented") # TODO: Implement return sound def parse_audio(self, audio_path): sound, sample_rate = self.retrieve_file(audio_path) + if sample_rate != self.sample_rate: raise ValueError(f"The stated sample rate {self.sample_rate} and the factual rate {sample_rate} differ!") if self.augment: sound = self.augment_audio(sound) - #sound = sound.cuda() - spectrogram = torch.stft(torch.from_numpy(sound.numpy().T.squeeze()), - n_fft=int(self.sample_rate * self.window_size), - hop_length=int(self.sample_rate * self.window_stride), - win_length=int(self.sample_rate * self.window_size), - window=torch.hamming_window(int(self.sample_rate * self.window_size)))[:, :, -1] + if global_settings.CUDA_ENABLED: + sound = sound.cuda() - if self.normalize: - mean = spectrogram.mean() - std = spectrogram.std() - spectrogram.add_(-mean) - spectrogram.div_(std) + # TODO: comment why take the last element? + spectrogram = torch.stft(torch.from_numpy(sound.numpy().T.squeeze()), + n_fft=self.window_size_samples, + hop_length=self.window_stride_samples, + win_length=self.window_size_samples, + window=torch.hamming_window(self.window_size_samples), + normalized=self.normalize)[:, :, -1] return spectrogram def parse_transcript(self, transcript_path): with open(transcript_path, 'r', encoding='utf8') as transcript_file: transcript = transcript_file.read().replace('\n', '') - print(f"1: {transcript}") + logger.info(f"1: {transcript}") # TODO: Is it fast enough? 
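        # The line below maps every character through labels_map and relies on
        # filter(None, ...) to drop characters outside the label set. Caveat
        # (an observation, not behaviour asserted by the repo): filter(None, ...)
        # also drops the label mapped to index 0 ('A' for the alphabet used in
        # main() below), because 0 is falsy. A minimal sketch of the difference:
        #   labels_map = {'A': 0, 'B': 1}
        #   list(filter(None, [labels_map.get(c) for c in "ABX"]))            # -> [1]
        #   [i for i in (labels_map.get(c) for c in "ABX") if i is not None]  # -> [0, 1]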
transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)])) - print(f"transcript_path: {transcript_path}\ntranscript: {transcript}") + logger.info(f"transcript_path: {transcript_path} transcript: {transcript}") return transcript class AudioDataset(Dataset): - def __init__(self, audio_conf, manifest_filepath, labels, normalize=False, augment=False): + + def __init__(self, processor: DataProcessor, manifest_filepath): """ Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by a comma. Each new line is a different sample. Example below: /path/to/audio.wav,/path/to/audio.txt ... - :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds + :param processor: Data processor object :param manifest_filepath: Path to manifest csv as describe above - :param labels: String containing all the possible characters to map to - :param normalize: Apply standard mean and deviation normalization to audio tensor - :param augment(default False): Apply random tempo and gain perturbations """ super(AudioDataset, self).__init__() with open(manifest_filepath) as f: @@ -103,7 +109,7 @@ def __init__(self, audio_conf, manifest_filepath, labels, normalize=False, augme ids = [x.strip().split(',') for x in ids] self.ids = ids self.size = len(ids) - self.processor = DataProcessor(audio_conf, labels, normalize, augment) + self.processor = processor def __getitem__(self, index): sample = self.ids[index] @@ -119,17 +125,22 @@ def __len__(self): def main(): - audio_conf = dict(sample_rate=16000, - window_size=.02, - window_stride=.01, - window='hamming') + global logger + logger = logging.getLogger(SONOSCO) + setup_logging(logger) + + # create data processor + audio_conf = dict(sample_rate=16000, window_size=.02, window_stride=.01, + labels='ABCDEFGHIJKLMNOPQRSTUVWXYZ', normalize=True, augment=False) + processor = DataProcessor(**audio_conf) + + # get manifest file manifest_directory = os.path.join(os.path.expanduser("~"), "temp/data/libri_speech") test_manifest = os.path.join(manifest_directory, "libri_test_clean_manifest.csv") - labels = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' - test_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=test_manifest, labels=labels, - normalize=False, augment=False) - print("Dataset is created\n====================\n") + # create audio dataset + test_dataset = AudioDataset(processor, manifest_filepath=test_manifest) + logger.info("Dataset is created") test = test_dataset[0] batch_size = 16 sampler = BucketingSampler(test_dataset, batch_size=batch_size) @@ -139,4 +150,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/sonosco/datasets/download_datasets/data_utils.py b/sonosco/datasets/download_datasets/data_utils.py index e6a5cd8..e7e15d5 100644 --- a/sonosco/datasets/download_datasets/data_utils.py +++ b/sonosco/datasets/download_datasets/data_utils.py @@ -7,7 +7,6 @@ from tqdm import tqdm -import pdb; pdb.set_trace() logger = logging.getLogger(__name__) diff --git a/sonosco/datasets/download_datasets/librispeech.py b/sonosco/datasets/download_datasets/librispeech.py index 9c0e6e4..59df937 100644 --- a/sonosco/datasets/download_datasets/librispeech.py +++ b/sonosco/datasets/download_datasets/librispeech.py @@ -12,7 +12,7 @@ from tqdm import tqdm -logger = logging.getLogger(SONOSCO) +logger = logging.getLogger(__name__) LIBRI_SPEECH_URLS = { @@ -121,6 +121,8 @@ def _process_file(wav_dir, txt_dir, base_filename, root_dir, 
sample_rate): help="Prunes training samples longer than the max duration (given in seconds).") def main(**kwargs): """Processes and downloads LibriSpeech dataset.""" + global logger + logger = logging.getLogger(SONOSCO) setup_logging(logger) try_download_librispeech(**kwargs) From 028f1aeda36d60b9cf3921928dd34ed379b0b179 Mon Sep 17 00:00:00 2001 From: ga38nif Date: Tue, 11 Jun 2019 17:49:27 +0200 Subject: [PATCH 26/58] adapt common_voice to click logger [Needs test with good internet connectivity] --- .../download_datasets/common_voice.py | 82 ++++++++++--------- 1 file changed, 45 insertions(+), 37 deletions(-) diff --git a/sonosco/datasets/download_datasets/common_voice.py b/sonosco/datasets/download_datasets/common_voice.py index 70451ab..e038dab 100644 --- a/sonosco/datasets/download_datasets/common_voice.py +++ b/sonosco/datasets/download_datasets/common_voice.py @@ -11,13 +11,14 @@ import subprocess from sonosco.datasets.download_datasets.data_utils import create_manifest from sonosco.common.utils import setup_logging +from sonosco.common.constants import * -logger = logging.getLogger("sonosco") +logger = logging.getLogger(__name__) COMMON_VOICE_URL = "https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz" @click.command() -@click.option("--target-dir", default="temp/data/libri_speech", type=str, help="Directory to store the dataset.") +@click.option("--target-dir", default="temp/data/common_voice", type=str, help="Directory to store the dataset.") @click.option("--sample-rate", default=16000, type=int, help="Sample rate.") @click.option("--files-to-use", multiple=True, default=["cv-valid-dev.csv","cv-valid-test.csv","cv-valid-train.csv"]) @@ -25,6 +26,44 @@ help="Prunes training samples shorter than the min duration (given in seconds).") @click.option("--max-duration", default=15, type=int, help="Prunes training samples longer than the max duration (given in seconds).") +def try_download_common_voice(target_dir, sample_rate, files_to_use, min_duration, max_duration): + path_to_data = os.path.join(os.path.expanduser("~"), target_dir) + path_utils.try_create_directory(path_to_data) + + target_unpacked_dir = os.path.join(path_to_data, "common_unpacked") + path_utils.try_create_directory(target_unpacked_dir) + + extracted_dir = os.path.join(path_to_data, "CommonVoice") + if os.path.exists(extracted_dir): + shutil.rmtree(extracted_dir) + logger.info("Start downloading...") + file_name = COMMON_VOICE_URL.split("/")[-1] + target_filename = os.path.join(target_unpacked_dir, file_name) + path_utils.try_download(target_filename, COMMON_VOICE_URL) + + logger.info("Download complete") + logger.info("Unpacking...") + tar = tarfile.open(target_filename) + tar.extractall(extracted_dir) + tar.close() + os.remove(target_unpacked_dir) + assert os.path.exists(extracted_dir), f"Archive {file_name} was not properly uncompressed" + logger.info("Converting files to wav and extracting transcripts...") + for csv_file in files_to_use.split(','): + convert_to_wav(os.path.join(extracted_dir, 'cv_corpus_v1/', csv_file), + os.path.join(target_dir, os.path.splitext(csv_file)[0]), + sample_rate) + logger.info(f"Finished {COMMON_VOICE_URL}") + shutil.rmtree(extracted_dir) + + logger.info('Creating manifests...') + for csv_file in files_to_use.split(','): + create_manifest(os.path.join(path_to_data, os.path.splitext(csv_file)[0]), + os.path.splitext(csv_file)[0] + '_manifest.csv', + min_duration, + max_duration) + + def convert_to_wav(csv_file, target_dir, sample_rate): """ Read *.csv file 
From 028f1aeda36d60b9cf3921928dd34ed379b0b179 Mon Sep 17 00:00:00 2001
From: ga38nif
Date: Tue, 11 Jun 2019 17:49:27 +0200
Subject: [PATCH 26/58] adapt common_voice to click logger [Needs test with good internet connectivity]

---
 .../download_datasets/common_voice.py         | 82 ++++++++++---------
 1 file changed, 45 insertions(+), 37 deletions(-)

diff --git a/sonosco/datasets/download_datasets/common_voice.py b/sonosco/datasets/download_datasets/common_voice.py
index 70451ab..e038dab 100644
--- a/sonosco/datasets/download_datasets/common_voice.py
+++ b/sonosco/datasets/download_datasets/common_voice.py
@@ -11,13 +11,14 @@ import subprocess
 from sonosco.datasets.download_datasets.data_utils import create_manifest
 from sonosco.common.utils import setup_logging
+from sonosco.common.constants import *

-logger = logging.getLogger("sonosco")
+logger = logging.getLogger(__name__)

 COMMON_VOICE_URL = "https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz"

 @click.command()
-@click.option("--target-dir", default="temp/data/libri_speech", type=str, help="Directory to store the dataset.")
+@click.option("--target-dir", default="temp/data/common_voice", type=str, help="Directory to store the dataset.")
 @click.option("--sample-rate", default=16000, type=int, help="Sample rate.")
 @click.option("--files-to-use", multiple=True,
               default=["cv-valid-dev.csv","cv-valid-test.csv","cv-valid-train.csv"])
@@ -25,6 +26,44 @@
               help="Prunes training samples shorter than the min duration (given in seconds).")
 @click.option("--max-duration", default=15, type=int,
               help="Prunes training samples longer than the max duration (given in seconds).")
+def try_download_common_voice(target_dir, sample_rate, files_to_use, min_duration, max_duration):
+    path_to_data = os.path.join(os.path.expanduser("~"), target_dir)
+    path_utils.try_create_directory(path_to_data)
+
+    target_unpacked_dir = os.path.join(path_to_data, "common_unpacked")
+    path_utils.try_create_directory(target_unpacked_dir)
+
+    extracted_dir = os.path.join(path_to_data, "CommonVoice")
+    if os.path.exists(extracted_dir):
+        shutil.rmtree(extracted_dir)
+    logger.info("Start downloading...")
+    file_name = COMMON_VOICE_URL.split("/")[-1]
+    target_filename = os.path.join(target_unpacked_dir, file_name)
+    path_utils.try_download(target_filename, COMMON_VOICE_URL)
+
+    logger.info("Download complete")
+    logger.info("Unpacking...")
+    tar = tarfile.open(target_filename)
+    tar.extractall(extracted_dir)
+    tar.close()
+    shutil.rmtree(target_unpacked_dir)
+    assert os.path.exists(extracted_dir), f"Archive {file_name} was not properly uncompressed"
+    logger.info("Converting files to wav and extracting transcripts...")
+    for csv_file in files_to_use:
+        convert_to_wav(os.path.join(extracted_dir, 'cv_corpus_v1/', csv_file),
+                       os.path.join(path_to_data, os.path.splitext(csv_file)[0]),
+                       sample_rate)
+    logger.info(f"Finished {COMMON_VOICE_URL}")
+    shutil.rmtree(extracted_dir)
+
+    logger.info('Creating manifests...')
+    for csv_file in files_to_use:
+        create_manifest(os.path.join(path_to_data, os.path.splitext(csv_file)[0]),
+                        os.path.splitext(csv_file)[0] + '_manifest.csv',
+                        min_duration,
+                        max_duration)
+
+
 def convert_to_wav(csv_file, target_dir, sample_rate):
     """ Read *.csv file description, convert mp3 to wav, process text.
@@ -51,48 +90,17 @@ def process(x):
                 os.path.join(wav_dir, file_name + '.wav'))
         subprocess.call([cmd], shell=True)

-    print('Converting mp3 to wav for {}.'.format(csv_file))
+    logger.info('Converting mp3 to wav for {}.'.format(csv_file))
     with open(csv_file) as csvfile:
         reader = csv.DictReader(csvfile)
         data = [(row['filename'], row['text']) for row in reader]
         with ThreadPool(10) as pool:
             pool.map(process, data)
-
-def main(target_dir, sample_rate, files_to_use, min_duration, max_duration):
+def main(**kwargs):
+    logger = logging.getLogger(SONOSCO)
     setup_logging(logger)
-
-    path_to_data = os.path.join(os.path.expanduser("~"), target_dir)
-    path_utils.try_create_directory(path_to_data)
-
-    target_unpacked_dir = os.path.join(target_dir, "common_unpacked")
-    path_utils.try_create_directory(target_unpacked_dir)
-
-    extracted_dir = os.path.join(path_to_data, "CommonVoice")
-    if os.path.exists(extracted_dir):
-        shutil.rmtree(extracted_dir)
-
-    path_utils.try_download(target_unpacked_dir, COMMON_VOICE_URL)
-
-    logger.info("Download complete")
-    logger.info("Unpacking...")
-
-    print("Unpacking corpus to {} ...".format(target_unpacked_dir))
-    tar = tarfile.open(target_unpacked_dir)
-    tar.extractall(extracted_dir)
-    tar.close()
-
-    for csv_file in args.files_to_process.split(','):
-        convert_to_wav(os.path.join(extracted_dir, 'cv_corpus_v1/', csv_file),
-                       os.path.join(target_dir, os.path.splitext(csv_file)[0]),
-                       sample_rate)
-
-    print('Creating manifests...')
-    for csv_file in files_to_use.split(','):
-        create_manifest(os.path.join(target_dir, os.path.splitext(csv_file)[0]),
-                        os.path.splitext(csv_file)[0] + '_manifest.csv',
-                        min_duration,
-                        max_duration)
+    try_download_common_voice(**kwargs)

 if __name__ == "__main__":

From b0ffed4a378bcbe1d522ba6ec2f4f8e7bcf50b3e Mon Sep 17 00:00:00 2001
From: ga38nif
Date: Wed, 12 Jun 2019 09:16:24 +0200
Subject: [PATCH 27/58] add function for an4 dataset to audio_tools

---
 sonosco/common/audio_tools.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sonosco/common/audio_tools.py b/sonosco/common/audio_tools.py
index 21c811b..77befb6 100644
--- a/sonosco/common/audio_tools.py
+++ b/sonosco/common/audio_tools.py
@@ -7,3 +7,6 @@ def get_duration(file_path):

 def transcode_recording(source, destination, sample_rate):
     subprocess.call([f"sox {source} -r {sample_rate} -b 16 -c 1 {destination}"], shell=True)
+
+def transcode_recordings_an4(raw_path, wav_path, sample_rate):
+    subprocess.call([f'sox -t raw -r {sample_rate} -b 16 -e signed-integer -B -c 1 \"{raw_path}\" \"{wav_path}\"'], shell=True)
\ No newline at end of file

From 52a24cd2537029eda7817ad65a4c4fb2f9ce59bf Mon Sep 17 00:00:00 2001
From: ga38nif
Date: Wed, 12 Jun 2019 09:17:09 +0200
Subject: [PATCH 28/58] add audio_tools call to common voice

---
 sonosco/datasets/download_datasets/common_voice.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/sonosco/datasets/download_datasets/common_voice.py b/sonosco/datasets/download_datasets/common_voice.py
index e038dab..7ea1eff 100644
--- a/sonosco/datasets/download_datasets/common_voice.py
+++ b/sonosco/datasets/download_datasets/common_voice.py
@@ -84,11 +84,7 @@ def process(x):
             text = text.strip().upper()
             with open(os.path.join(txt_dir, file_name + '.txt'), 'w') as f:
                 f.write(text)
-            cmd = "sox {} -r {} -b 16 -c 1 {}".format(
-                os.path.join(path_to_data, file_path),
-                sample_rate,
-                os.path.join(wav_dir, file_name + '.wav'))
-            subprocess.call([cmd], shell=True)
+            audio_tools.transcode_recording(source=os.path.join(path_to_data, file_path),
+                                            destination=os.path.join(wav_dir, file_name + '.wav'),
+                                            sample_rate=sample_rate)

     logger.info('Converting mp3 to wav for {}.'.format(csv_file))
     with open(csv_file) as csvfile:
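The audio_tools helpers used above build their sox command lines with f-strings and shell=True, which breaks on paths containing spaces or shell metacharacters. A quoting-safe variant of transcode_recording is sketched below; the name transcode_recording_safe is hypothetical, and sox is assumed to be on the PATH.

import subprocess


def transcode_recording_safe(source, destination, sample_rate):
    # Same command as transcode_recording, but passed as an argument list
    # so the shell never re-parses the file paths.
    subprocess.run(
        ["sox", str(source), "-r", str(sample_rate), "-b", "16", "-c", "1", str(destination)],
        check=True,
    )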
From d3796279c59883d4002ce14dd5cdf078775841a5 Mon Sep 17 00:00:00 2001
From: ga38nif
Date: Wed, 12 Jun 2019 09:18:00 +0200
Subject: [PATCH 29/58] adapt an4 to new style and make it work.

---
 sonosco/datasets/download_datasets/an4.py | 133 +++++++++++++---------
 1 file changed, 77 insertions(+), 56 deletions(-)

diff --git a/sonosco/datasets/download_datasets/an4.py b/sonosco/datasets/download_datasets/an4.py
index f810ee0..a36f952 100644
--- a/sonosco/datasets/download_datasets/an4.py
+++ b/sonosco/datasets/download_datasets/an4.py
@@ -1,60 +1,84 @@
-import argparse
 import os
+import click
 import io
 import shutil
 import tarfile
-import wget
-
-from utils import create_manifest
-
-parser = argparse.ArgumentParser(description='Processes and downloads an4.')
-parser.add_argument('--target-dir', default='an4_dataset/', help='Path to save dataset')
-parser.add_argument('--min-duration', default=1, type=int,
-                    help='Prunes training samples shorter than the min duration (given in seconds, default 1)')
-parser.add_argument('--max-duration', default=15, type=int,
-                    help='Prunes training samples longer than the max duration (given in seconds, default 15)')
-args = parser.parse_args()
-
-
-def _format_data(root_path, data_tag, name, wav_folder):
-    data_path = args.target_dir + data_tag + '/' + name + '/'
-    new_transcript_path = data_path + '/txt/'
-    new_wav_path = data_path + '/wav/'
-
-    os.makedirs(new_transcript_path)
-    os.makedirs(new_wav_path)
-
-    wav_path = root_path + 'wav/'
-    file_ids = root_path + 'etc/an4_%s.fileids' % data_tag
-    transcripts = root_path + 'etc/an4_%s.transcription' % data_tag
-    train_path = wav_path + wav_folder
-
-    _convert_audio_to_wav(train_path)
-    _format_files(file_ids, new_transcript_path, new_wav_path, transcripts, wav_path)
-
-
-def _convert_audio_to_wav(train_path):
+import logging
+import sonosco.common.audio_tools as audio_tools
+import sonosco.common.path_utils as path_utils
+
+from sonosco.datasets.download_datasets.data_utils import create_manifest
+from sonosco.common.utils import setup_logging
+from sonosco.common.constants import *
+
+logger = logging.getLogger(__name__)
+
+AN4_URL = 'http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz'
+
+def try_download_an4(target_dir, sample_rate, min_duration, max_duration):
+    path_to_data = os.path.join(os.path.expanduser("~"), target_dir)
+    if not os.path.exists(path_to_data):
+        os.makedirs(path_to_data)
+    target_unpacked_dir = os.path.join(path_to_data, "an4_unpacked")
+    path_utils.try_create_directory(target_unpacked_dir)
+
+    extracted_dir = os.path.join(path_to_data, "An4")
+    if os.path.exists(extracted_dir):
+        shutil.rmtree(extracted_dir)
+    logger.info("Start downloading...")
+    file_name = AN4_URL.split("/")[-1]
+
+    target_filename = os.path.join(target_unpacked_dir, file_name)
+    path_utils.try_download(target_filename, AN4_URL)
+    logger.info("Download complete")
+    logger.info("Unpacking...")
+    tar = tarfile.open(target_filename)
+    tar.extractall(extracted_dir)
+    tar.close()
+    assert os.path.exists(extracted_dir), f"Archive {file_name} was not properly uncompressed"
+    logger.info("Converting files to wav and extracting transcripts...")
+
+    create_wav_and_transcripts(path_to_data, 'train', sample_rate, extracted_dir, 'an4_clstk')
+    create_wav_and_transcripts(path_to_data, 'test', sample_rate, extracted_dir, 'an4test_clstk')
+
+    create_manifest(os.path.join(path_to_data, 'train'), os.path.join(path_to_data, 'an4_train_manifest.csv'), min_duration, max_duration)
+    create_manifest(os.path.join(path_to_data, 'test'), os.path.join(path_to_data, 'an4_val_manifest.csv'), min_duration, max_duration)
+
+def create_wav_and_transcripts(path, data_tag, sample_rate, extracted_dir, wav_subfolder_name):
+    tag_path = os.path.join(path, data_tag)
+    transcript_path_new = os.path.join(tag_path, 'txt')
+    wav_path_new = os.path.join(tag_path, 'wav')
+
+    path_utils.try_create_directory(transcript_path_new)
+    path_utils.try_create_directory(wav_path_new)
+
+    wav_path_ext = os.path.join(extracted_dir, 'an4/wav')
+    file_ids = os.path.join(extracted_dir, f'an4/etc/an4_{data_tag}.fileids')
+    transcripts_ext = os.path.join(extracted_dir, f'an4/etc/an4_{data_tag}.transcription')
+    path = os.path.join(wav_path_ext, wav_subfolder_name)
+    convert_audio_to_wav(path, sample_rate)
+    format_files(file_ids, transcript_path_new, wav_path_new, transcripts_ext, wav_path_ext)

+def convert_audio_to_wav(train_path, sample_rate):
     with os.popen('find %s -type f -name "*.raw"' % train_path) as pipe:
         for line in pipe:
             raw_path = line.strip()
             new_path = line.replace('.raw', '.wav').strip()
-            cmd = 'sox -t raw -r %d -b 16 -e signed-integer -B -c 1 \"%s\" \"%s\"' % (
-                16000, raw_path, new_path)
-            os.system(cmd)
+            audio_tools.transcode_recordings_an4(raw_path=raw_path, wav_path=new_path, sample_rate=sample_rate)


-def _format_files(file_ids, new_transcript_path, new_wav_path, transcripts, wav_path):
+def format_files(file_ids, new_transcript_path, new_wav_path, transcripts, wav_path):
     with open(file_ids, 'r') as f:
         with open(transcripts, 'r') as t:
             paths = f.readlines()
             transcripts = t.readlines()
             for x in range(len(paths)):
-                path = wav_path + paths[x].strip() + '.wav'
+                path = os.path.join(wav_path, paths[x].strip()) + '.wav'
                 filename = path.split('/')[-1]
                 extracted_transcript = _process_transcript(transcripts, x)
                 current_path = os.path.abspath(path)
-                new_path = new_wav_path + filename
-                text_path = new_transcript_path + filename.replace('.wav', '.txt')
+                new_path = os.path.join(new_wav_path, filename)
+                text_path = os.path.join(new_transcript_path, filename.replace('.wav', '.txt'))
                 with io.FileIO(text_path, "w") as file:
                     file.write(extracted_transcript.encode('utf-8'))
                 os.rename(current_path, new_path)
@@ -64,23 +88,20 @@ def _process_transcript(transcripts, x):
     extracted_transcript = transcripts[x].split('(')[0].strip("").split('<')[0].strip().upper()
     return extracted_transcript

-
-def main():
-    root_path = 'an4/'
-    name = 'an4'
-    wget.download('http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz')
-    tar = tarfile.open('an4_raw.bigendian.tar.gz')
-    tar.extractall()
-    os.makedirs(args.target_dir)
-    _format_data(root_path, 'train', name, 'an4_clstk')
-    _format_data(root_path, 'test', name, 'an4test_clstk')
-    shutil.rmtree(root_path)
-    os.remove('an4_raw.bigendian.tar.gz')
-    train_path = args.target_dir + '/train/'
-    test_path = args.target_dir + '/test/'
-    print ('\n', 'Creating manifests...')
-    create_manifest(train_path, 'an4_train_manifest.csv', args.min_duration, args.max_duration)
-    create_manifest(test_path, 'an4_val_manifest.csv')
+@click.command()
+@click.option("--target-dir", default="temp/data/an4", type=str, help="Directory to store the dataset.")
+@click.option("--sample-rate", default=16000, type=int, help="Sample rate.")
+@click.option("--min-duration", default=1, type=int,
+              help="Prunes training samples shorter than the min duration (given in seconds).")
+@click.option("--max-duration", default=15, type=int,
+              help="Prunes training samples longer than the max duration (given in seconds).")
+def main(**kwargs):
+    """Processes and downloads an4 dataset."""
+    global logger
+    logger = logging.getLogger(SONOSCO)
+    setup_logging(logger)
+    try_download_an4(**kwargs)

 if __name__ == '__main__':
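transcode_recordings_an4 needs the sox flags -t raw, -e signed-integer and -B because the AN4 corpus ships headerless big-endian 16-bit PCM. For reference, the same conversion can be done with the standard library alone; this sketch assumes mono 16-bit big-endian input at the stated rate and performs no resampling.

import wave


def raw_to_wav(raw_path, wav_path, sample_rate):
    with open(raw_path, "rb") as f:
        pcm = bytearray(f.read())
    # WAV stores little-endian samples; AN4 raw files are big-endian,
    # so swap the two bytes of every 16-bit sample.
    pcm[0::2], pcm[1::2] = pcm[1::2], pcm[0::2]
    with wave.open(wav_path, "wb") as w:
        w.setnchannels(1)
        w.setsampwidth(2)
        w.setframerate(sample_rate)
        w.writeframes(bytes(pcm))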
From b93f7ee154221e23ff9201f955e8aed3b33bc5ce Mon Sep 17 00:00:00 2001
From: ga38nif
Date: Wed, 12 Jun 2019 11:14:21 +0200
Subject: [PATCH 30/58] add global logger

---
 sonosco/datasets/download_datasets/common_voice.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sonosco/datasets/download_datasets/common_voice.py b/sonosco/datasets/download_datasets/common_voice.py
index 7ea1eff..1c66e4f 100644
--- a/sonosco/datasets/download_datasets/common_voice.py
+++ b/sonosco/datasets/download_datasets/common_voice.py
@@ -94,6 +94,7 @@ def process(x):
             pool.map(process, data)

 def main(**kwargs):
+    global logger
     logger = logging.getLogger(SONOSCO)
     setup_logging(logger)
     try_download_common_voice(**kwargs)

From 31ae0266730d9d99f4ebb323fe24e4404e968513 Mon Sep 17 00:00:00 2001
From: ga38nif
Date: Wed, 12 Jun 2019 11:14:44 +0200
Subject: [PATCH 31/58] add transcription function for ted3

---
 sonosco/common/audio_tools.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sonosco/common/audio_tools.py b/sonosco/common/audio_tools.py
index 77befb6..cd7a2bc 100644
--- a/sonosco/common/audio_tools.py
+++ b/sonosco/common/audio_tools.py
@@ -9,4 +9,7 @@ def transcode_recording(source, destination, sample_rate):
     subprocess.call([f"sox {source} -r {sample_rate} -b 16 -c 1 {destination}"], shell=True)

 def transcode_recordings_an4(raw_path, wav_path, sample_rate):
-    subprocess.call([f'sox -t raw -r {sample_rate} -b 16 -e signed-integer -B -c 1 \"{raw_path}\" \"{wav_path}\"'], shell=True)
\ No newline at end of file
+    subprocess.call([f'sox -t raw -r {sample_rate} -b 16 -e signed-integer -B -c 1 \"{raw_path}\" \"{wav_path}\"'], shell=True)
+
+def transcode_recordings_ted3(source, destination, start_time, end_time, sample_rate):
+    subprocess.call([f"sox {source} -r {sample_rate} -b 16 -c 1 {destination} trim {start_time} ={end_time}"], shell=True)
\ No newline at end of file
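transcode_recordings_ted3 relies on sox's trim effect, where a position written as =<end> is an absolute offset from the start of the file rather than a duration, which is exactly what the TED-LIUM STM segment boundaries provide. A quoting-safe sketch of the same cut follows; the name cut_segment is illustrative.

import subprocess


def cut_segment(source, destination, start_time, end_time, sample_rate=16000):
    # "trim <start> =<end>" keeps audio from <start> up to the absolute
    # position <end>; without the "=", the second value would be a duration.
    subprocess.run(
        ["sox", str(source), "-r", str(sample_rate), "-b", "16", "-c", "1",
         str(destination), "trim", str(start_time), f"={end_time}"],
        check=True,
    )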
From 459eef404b716b759ab142a23b8547eaa0117efc Mon Sep 17 00:00:00 2001
From: ga38nif
Date: Wed, 12 Jun 2019 11:15:09 +0200
Subject: [PATCH 32/58] adapt download script for ted3 to click and new scheme

---
 sonosco/datasets/download_datasets/ted3.py | 109 +++++++++++----------
 1 file changed, 55 insertions(+), 54 deletions(-)

diff --git a/sonosco/datasets/download_datasets/ted3.py b/sonosco/datasets/download_datasets/ted3.py
index c7d4b3e..b50cdd6 100644
--- a/sonosco/datasets/download_datasets/ted3.py
+++ b/sonosco/datasets/download_datasets/ted3.py
@@ -1,25 +1,49 @@
 import os
-import wget
-import tarfile
+import click
+import logging
-import argparse
 import subprocess
 import unicodedata
+import tarfile
 import io
-from utils import create_manifest
+import shutil
+import sonosco.common.audio_tools as audio_tools
+import sonosco.common.path_utils as path_utils
+from sonosco.datasets.download_datasets.data_utils import create_manifest
+from sonosco.common.utils import setup_logging
+from sonosco.common.constants import *

 from tqdm import tqdm

-parser = argparse.ArgumentParser(description='Processes and downloads TED-LIUMv3 dataset.')
-parser.add_argument("--target-dir", default='TEDLIUM3_dataset/', type=str, help="Directory to store the dataset.")
-parser.add_argument("--tar-path", type=str, help="Path to the TEDLIUM_release tar if downloaded (Optional).")
-parser.add_argument('--sample-rate', default=16000, type=int, help='Sample rate')
-parser.add_argument('--min-duration', default=1, type=int,
-                    help='Prunes training samples shorter than the min duration (given in seconds, default 1)')
-parser.add_argument('--max-duration', default=15, type=int,
-                    help='Prunes training samples longer than the max duration (given in seconds, default 15)')
-args = parser.parse_args()
+logger = logging.getLogger(__name__)

 TED_LIUM_V2_DL_URL = "http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz"

+def try_download_ted3(target_dir, sample_rate, min_duration, max_duration):
+    path_to_data = os.path.join(os.path.expanduser("~"), target_dir)
+    path_utils.try_create_directory(path_to_data)
+
+    target_unpacked_dir = os.path.join(path_to_data, "ted3_unpacked")
+    path_utils.try_create_directory(target_unpacked_dir)
+
+    extracted_dir = os.path.join(path_to_data, "Ted3")
+    if os.path.exists(extracted_dir):
+        shutil.rmtree(extracted_dir)
+    logger.info("Start downloading...")
+    file_name = TED_LIUM_V2_DL_URL.split("/")[-1]
+    target_filename = os.path.join(target_unpacked_dir, file_name)
+    path_utils.try_download(target_filename, TED_LIUM_V2_DL_URL)
+
+    logger.info("Download complete")
+    logger.info("Unpacking...")
+    tar = tarfile.open(target_filename)
+    tar.extractall(extracted_dir)
+    tar.close()
+    shutil.rmtree(target_unpacked_dir)
+    assert os.path.exists(extracted_dir), f"Archive {file_name} was not properly uncompressed"
+    logger.info("Converting files to wav and extracting transcripts...")
+    prepare_dir(path_to_data, sample_rate)
+    create_manifest(path_to_data, os.path.join(path_to_data, 'ted3_train_manifest.csv'), min_duration, max_duration)

 def get_utterances_from_stm(stm_file):
     """
@@ -45,12 +69,6 @@ def get_utterances_from_stm(stm_file):
     return res

-def cut_utterance(src_sph_file, target_wav_file, start_time, end_time, sample_rate=16000):
-    subprocess.call(["sox {} -r {} -b 16 -c 1 {} trim {} ={}".format(src_sph_file, str(sample_rate),
-                                                                     target_wav_file, start_time, end_time)],
-                    shell=True)
-
-
 def _preprocess_transcript(phrase):
     return phrase.strip().upper()

@@ -59,15 +77,12 @@ def filter_short_utterances(utterance_info, min_len_sec=1.0):
     return utterance_info["end_time"] - utterance_info["start_time"] > min_len_sec

-def prepare_dir(ted_dir):
-    converted_dir = os.path.join(ted_dir, "converted")
+def prepare_dir(ted_dir, sample_rate):
     # directories to store converted wav files and their transcriptions
-    wav_dir = os.path.join(converted_dir, "wav")
-    if not os.path.exists(wav_dir):
-        os.makedirs(wav_dir)
-    txt_dir = os.path.join(converted_dir, "txt")
-    if not os.path.exists(txt_dir):
-        os.makedirs(txt_dir)
+    wav_dir = os.path.join(ted_dir, "wav")
+    path_utils.try_create_directory(wav_dir)
+    txt_dir = os.path.join(ted_dir, "txt")
+    path_utils.try_create_directory(txt_dir)
     counter = 0
     entries = os.listdir(os.path.join(ted_dir, "sph"))
     for sph_file in tqdm(entries, total=len(entries)):
@@ -83,41 +98,27 @@ def prepare_dir(ted_dir):
         for utterance_id, utterance in enumerate(all_utterances):
             target_wav_file = os.path.join(wav_dir, "{}_{}.wav".format(utterance["filename"], str(utterance_id)))
             target_txt_file = os.path.join(txt_dir, "{}_{}.txt".format(utterance["filename"], str(utterance_id)))
-            cut_utterance(sph_file_full, target_wav_file, utterance["start_time"], utterance["end_time"],
-                          sample_rate=args.sample_rate)
+
audio_tools.transcode_recordings_ted3(sph_file_full, target_wav_file, utterance["start_time"], utterance["end_time"], + sample_rate=sample_rate) with io.FileIO(target_txt_file, "w") as f: f.write(_preprocess_transcript(utterance["transcript"]).encode('utf-8')) counter += 1 +@click.command() +@click.option("--target-dir", default="temp/data/ted3", type=str, help="Directory to store the dataset.") +@click.option("--sample-rate", default=16000, type=int, help="Sample rate.") -def main(): - target_dl_dir = args.target_dir - #if not os.path.exists(target_dl_dir): - # os.makedirs(target_dl_dir) - - target_unpacked_dir = os.path.join(target_dl_dir, "TEDLIUM_release-3") - #if args.tar_path and os.path.exists(args.tar_path): - # target_file = args.tar_path - #else: - # print("Could not find downloaded TEDLIUM archive, Downloading corpus...") - # wget.download(TED_LIUM_V2_DL_URL, target_dl_dir) - # target_file = os.path.join(target_dl_dir, "TEDLIUM_release-3.tgz") - #if not os.path.exists(target_unpacked_dir): - # print("Unpacking corpus...") - # tar = tarfile.open(target_file) - # tar.extractall(target_dl_dir) - # tar.close() - #else: - # print("Found TEDLIUM directory, skipping unpacking of tar files") - - train_ted_dir = os.path.join(target_unpacked_dir, "data") - train_ted_dir = os.path.join(train_ted_dir, "converted") - - #prepare_dir(train_ted_dir) - print('Creating manifests...') +@click.option("--min-duration", default=1, type=int, + help="Prunes training samples shorter than the min duration (given in seconds).") +@click.option("--max-duration", default=15, type=int, + help="Prunes training samples longer than the max duration (given in seconds).") - create_manifest(train_ted_dir, 'ted3_train_manifest.csv', args.min_duration, args.max_duration) +def main(**kwargs): + global logger + logger = logging.getLogger(SONOSCO) + setup_logging(logger) + try_download_ted3(**kwargs) if __name__ == "__main__": main() From 4ac59e99b396e6e2730d4568d45547d81ee75880 Mon Sep 17 00:00:00 2001 From: ga38nif Date: Wed, 12 Jun 2019 11:32:45 +0200 Subject: [PATCH 33/58] adapt voxforge to click and adapt to datapaths --- .../datasets/download_datasets/voxforge.py | 71 +++++++++++-------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/sonosco/datasets/download_datasets/voxforge.py b/sonosco/datasets/download_datasets/voxforge.py index a31febf..6317354 100644 --- a/sonosco/datasets/download_datasets/voxforge.py +++ b/sonosco/datasets/download_datasets/voxforge.py @@ -1,4 +1,6 @@ import os +import click +import logging from six.moves import urllib import argparse import re @@ -7,22 +9,29 @@ import subprocess import tarfile import io +import sonosco.common.audio_tools as audio_tools +import sonosco.common.path_utils as path_utils +from sonosco.datasets.download_datasets.data_utils import create_manifest +from sonosco.common.utils import setup_logging +from sonosco.common.constants import * from tqdm import tqdm -from utils import create_manifest +logger = logging.getLogger(__name__) VOXFORGE_URL_16kHz = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit/' -parser = argparse.ArgumentParser(description='Processes and downloads VoxForge dataset.') -parser.add_argument("--target-dir", default='voxforge_dataset/', type=str, help="Directory to store the dataset.") -parser.add_argument('--sample-rate', default=16000, - type=int, help='Sample rate') -parser.add_argument('--min-duration', default=1, type=int, - help='Prunes training samples shorter than the min duration 
(given in seconds, default 1)') -parser.add_argument('--max-duration', default=15, type=int, - help='Prunes training samples longer than the max duration (given in seconds, default 15)') -args = parser.parse_args() +def try_download_voxforge(target_dir, sample_rate, min_duration, max_duration): + path_to_data = os.path.join(os.path.expanduser("~"), target_dir) + path_utils.try_create_directory(path_to_data) + logger.info("Start downloading...") + request = urllib.request.Request(VOXFORGE_URL_16kHz) + response = urllib.request.urlopen(request) + content = response.read() + all_files = re.findall("href\=\"(.*\.tgz)\"", content.decode("utf-8")) + for f in tqdm(all_files, total=len(all_files)): + prepare_sample(f.replace(".tgz", ""), VOXFORGE_URL_16kHz + f, path_to_data, sample_rate) + create_manifest(path_to_data, os.path.join(path_to_data,'voxforge_train_manifest.csv'), min_duration, max_duration) def _get_recordings_dir(sample_dir, recording_name): wav_dir = os.path.join(sample_dir, recording_name, "wav") @@ -34,16 +43,14 @@ def _get_recordings_dir(sample_dir, recording_name): raise Exception("wav or flac directory was not found for recording name: {}".format(recording_name)) -def prepare_sample(recording_name, url, target_folder): +def prepare_sample(recording_name, url, target_folder, sample_rate): """ Downloads and extracts a sample from VoxForge and puts the wav and txt files into :target_folder. """ wav_dir = os.path.join(target_folder, "wav") - if not os.path.exists(wav_dir): - os.makedirs(wav_dir) + path_utils.try_create_directory(wav_dir) txt_dir = os.path.join(target_folder, "txt") - if not os.path.exists(txt_dir): - os.makedirs(txt_dir) + path_utils.try_create_directory(txt_dir) # check if sample is processed filename_set = set(['_'.join(wav_file.split('_')[:-1]) for wav_file in os.listdir(wav_dir)]) if recording_name in filename_set: @@ -80,23 +87,27 @@ def prepare_sample(recording_name, url, target_folder): with io.FileIO(target_txt_file, "w") as file: file.write(utterance.encode('utf-8')) original_wav_file = os.path.join(recordings_dir, wav_file) - subprocess.call(["sox {} -r {} -b 16 -c 1 {}".format(original_wav_file, str(args.sample_rate), - target_wav_file)], shell=True) + audio_tools.transcode_recording(original_wav_file, target_wav_file, sample_rate) shutil.rmtree(dirpath) +@click.command() +@click.option("--target-dir", default="temp/data/voxforge", type=str, help="Directory to store the dataset.") +@click.option("--sample-rate", default=16000, type=int, help="Sample rate.") -if __name__ == '__main__': - target_dir = args.target_dir - sample_rate = args.sample_rate +@click.option("--min-duration", default=1, type=int, + help="Prunes training samples shorter than the min duration (given in seconds).") +@click.option("--max-duration", default=15, type=int, + help="Prunes training samples longer than the max duration (given in seconds).") - if not os.path.isdir(target_dir): - os.makedirs(target_dir) - request = urllib.request.Request(VOXFORGE_URL_16kHz) - response = urllib.request.urlopen(request) - content = response.read() - all_files = re.findall("href\=\"(.*\.tgz)\"", content.decode("utf-8")) - for f in tqdm(all_files, total=len(all_files)): - prepare_sample(f.replace(".tgz", ""), VOXFORGE_URL_16kHz + f, target_dir) - print('Creating manifests...') - create_manifest(target_dir, 'voxforge_train_manifest.csv', args.min_duration, args.max_duration) \ No newline at end of file + + +def main(**kwargs): + global logger + logger = logging.getLogger(SONOSCO) + 
setup_logging(logger) + try_download_voxforge(**kwargs) + + +if __name__ == '__main__': + main() \ No newline at end of file From 4cefe935ae67e8c492569208a2230a09601f00a7 Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Wed, 12 Jun 2019 18:22:33 +0200 Subject: [PATCH 34/58] add tests --- requirements.txt | 1 + sonosco/common/click_extensions.py | 4 +- ...udioDataLoader.py => audio_data_loader.py} | 0 ...ioDataSampler.py => audio_data_sampler.py} | 0 .../{AudioDataset.py => audio_dataset.py} | 37 ++----------- .../download_datasets/common_voice.py | 8 +-- .../datasets/download_datasets/data_utils.py | 10 ++-- .../datasets/download_datasets/librispeech.py | 19 +++---- tests/test_dataset.py | 53 +++++++++++++++++++ 9 files changed, 79 insertions(+), 53 deletions(-) rename sonosco/datasets/{AudioDataLoader.py => audio_data_loader.py} (100%) rename sonosco/datasets/{AudioDataSampler.py => audio_data_sampler.py} (100%) rename sonosco/datasets/{AudioDataset.py => audio_dataset.py} (78%) create mode 100644 tests/test_dataset.py diff --git a/requirements.txt b/requirements.txt index 46e6bd6..3547e7e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,3 +20,4 @@ torchvision==0.3.0 tqdm==4.32.1 pyyaml==5.1 wget==3.2 +pytest \ No newline at end of file diff --git a/sonosco/common/click_extensions.py b/sonosco/common/click_extensions.py index 18ff96f..3554572 100644 --- a/sonosco/common/click_extensions.py +++ b/sonosco/common/click_extensions.py @@ -3,7 +3,7 @@ import logging -logger = logging.getLogger(__name__) +LOGGER = logging.getLogger(__name__) class PythonLiteralOption(click.Option): @@ -12,5 +12,5 @@ def type_cast_value(self, ctx, value): try: return ast.literal_eval(value) except Exception as e: - logger.error(f"Malformed click input for PythonLiteralOption {e}", exc_info=True) + LOGGER.error(f"Malformed click input for PythonLiteralOption {e}", exc_info=True) raise click.BadParameter(value) diff --git a/sonosco/datasets/AudioDataLoader.py b/sonosco/datasets/audio_data_loader.py similarity index 100% rename from sonosco/datasets/AudioDataLoader.py rename to sonosco/datasets/audio_data_loader.py diff --git a/sonosco/datasets/AudioDataSampler.py b/sonosco/datasets/audio_data_sampler.py similarity index 100% rename from sonosco/datasets/AudioDataSampler.py rename to sonosco/datasets/audio_data_sampler.py diff --git a/sonosco/datasets/AudioDataset.py b/sonosco/datasets/audio_dataset.py similarity index 78% rename from sonosco/datasets/AudioDataset.py rename to sonosco/datasets/audio_dataset.py index 098d195..8677396 100644 --- a/sonosco/datasets/AudioDataset.py +++ b/sonosco/datasets/audio_dataset.py @@ -16,7 +16,7 @@ from sonosco.common.constants import * -logger = logging.getLogger(__name__) +LOGGER = logging.getLogger(__name__) class DataProcessor: @@ -85,10 +85,10 @@ def parse_audio(self, audio_path): def parse_transcript(self, transcript_path): with open(transcript_path, 'r', encoding='utf8') as transcript_file: transcript = transcript_file.read().replace('\n', '') - logger.info(f"1: {transcript}") + LOGGER.info(f"1: {transcript}") # TODO: Is it fast enough? 
transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)])) - logger.info(f"transcript_path: {transcript_path} transcript: {transcript}") + LOGGER.info(f"transcript_path: {transcript_path} transcript: {transcript}") return transcript @@ -103,7 +103,7 @@ def __init__(self, processor: DataProcessor, manifest_filepath): :param processor: Data processor object :param manifest_filepath: Path to manifest csv as describe above """ - super(AudioDataset, self).__init__() + super().__init__() with open(manifest_filepath) as f: ids = f.readlines() ids = [x.strip().split(',') for x in ids] @@ -122,32 +122,3 @@ def __getitem__(self, index): def __len__(self): return self.size - - -def main(): - global logger - logger = logging.getLogger(SONOSCO) - setup_logging(logger) - - # create data processor - audio_conf = dict(sample_rate=16000, window_size=.02, window_stride=.01, - labels='ABCDEFGHIJKLMNOPQRSTUVWXYZ', normalize=True, augment=False) - processor = DataProcessor(**audio_conf) - - # get manifest file - manifest_directory = os.path.join(os.path.expanduser("~"), "temp/data/libri_speech") - test_manifest = os.path.join(manifest_directory, "libri_test_clean_manifest.csv") - - # create audio dataset - test_dataset = AudioDataset(processor, manifest_filepath=test_manifest) - logger.info("Dataset is created") - test = test_dataset[0] - batch_size = 16 - sampler = BucketingSampler(test_dataset, batch_size=batch_size) - dataloader = DataLoader(dataset=test_dataset, num_workers=4, collate_fn=_collate_fn, batch_sampler=sampler) - test_dataset[0] - #inputs, targets, input_percentages, target_sizes = next(iter(dataloader)) - - -if __name__ == "__main__": - main() diff --git a/sonosco/datasets/download_datasets/common_voice.py b/sonosco/datasets/download_datasets/common_voice.py index 70451ab..3b75e52 100644 --- a/sonosco/datasets/download_datasets/common_voice.py +++ b/sonosco/datasets/download_datasets/common_voice.py @@ -12,7 +12,7 @@ from sonosco.datasets.download_datasets.data_utils import create_manifest from sonosco.common.utils import setup_logging -logger = logging.getLogger("sonosco") +LOGGER = logging.getLogger("sonosco") COMMON_VOICE_URL = "https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz" @@ -60,7 +60,7 @@ def process(x): def main(target_dir, sample_rate, files_to_use, min_duration, max_duration): - setup_logging(logger) + setup_logging(LOGGER) path_to_data = os.path.join(os.path.expanduser("~"), target_dir) path_utils.try_create_directory(path_to_data) @@ -74,8 +74,8 @@ def main(target_dir, sample_rate, files_to_use, min_duration, max_duration): path_utils.try_download(target_unpacked_dir, COMMON_VOICE_URL) - logger.info("Download complete") - logger.info("Unpacking...") + LOGGER.info("Download complete") + LOGGER.info("Unpacking...") print("Unpacking corpus to {} ...".format(target_unpacked_dir)) tar = tarfile.open(target_unpacked_dir) diff --git a/sonosco/datasets/download_datasets/data_utils.py b/sonosco/datasets/download_datasets/data_utils.py index e7e15d5..167eb92 100644 --- a/sonosco/datasets/download_datasets/data_utils.py +++ b/sonosco/datasets/download_datasets/data_utils.py @@ -7,15 +7,15 @@ from tqdm import tqdm -logger = logging.getLogger(__name__) +LOGGER = logging.getLogger(__name__) def create_manifest(data_path, output_path, min_duration=None, max_duration=None): - logger.info(f"Creating a manifest for path: {data_path}") + LOGGER.info(f"Creating a manifest for path: {data_path}") file_paths = [os.path.join(dirpath, f) for dirpath, 
dirnames, files in os.walk(data_path) for f in fnmatch.filter(files, '*.wav')] - logger.info(f"Found {len(file_paths)} .wav files") + LOGGER.info(f"Found {len(file_paths)} .wav files") file_paths = order_and_prune_files(file_paths, min_duration, max_duration) with io.FileIO(output_path, "w") as file: for wav_path in tqdm(file_paths, total=len(file_paths)): @@ -25,11 +25,11 @@ def create_manifest(data_path, output_path, min_duration=None, max_duration=None def order_and_prune_files(file_paths, min_duration, max_duration): - logger.info("Sorting manifests...") + LOGGER.info("Sorting manifests...") path_and_duration = [(path, audio_tools.get_duration(path)) for path in file_paths] if min_duration and max_duration: - logger.info(f"Pruning manifests between {min_duration} and {max_duration} seconds") + LOGGER.info(f"Pruning manifests between {min_duration} and {max_duration} seconds") path_and_duration = [(path, duration) for path, duration in path_and_duration if min_duration <= duration <= max_duration] diff --git a/sonosco/datasets/download_datasets/librispeech.py b/sonosco/datasets/download_datasets/librispeech.py index 59df937..d4fb0e7 100644 --- a/sonosco/datasets/download_datasets/librispeech.py +++ b/sonosco/datasets/download_datasets/librispeech.py @@ -12,7 +12,7 @@ from tqdm import tqdm -logger = logging.getLogger(__name__) +LOGGER = logging.getLogger(__name__) LIBRI_SPEECH_URLS = { @@ -52,28 +52,29 @@ def try_download_librispeech(target_dir, sample_rate, files_to_use, min_duration if url.find(f) != -1: dl_flag = True if not dl_flag: - logger.info(f"Skipping url: {url}") + LOGGER.info(f"Skipping url: {url}") continue filename = url.split("/")[-1] target_filename = os.path.join(split_dir, filename) + LOGGER.info(f"Downloading from {url}") path_utils.try_download(target_filename, url) - logger.info("Download complete") - logger.info(f"Unpacking {filename}...") + LOGGER.info("Download complete") + LOGGER.info(f"Unpacking {filename}...") tar = tarfile.open(target_filename) tar.extractall(split_dir) tar.close() os.remove(target_filename) assert os.path.exists(extracted_dir), f"Archive {filename} was not properly uncompressed" - logger.info("Converting flac files to wav and extracting transcripts...") + LOGGER.info("Converting flac files to wav and extracting transcripts...") for root, subdirs, files in tqdm(os.walk(extracted_dir)): for f in files: if f.find(".flac") != -1: _process_file(wav_dir=split_wav_dir, txt_dir=split_txt_dir, base_filename=f, root_dir=root, sample_rate=sample_rate) - logger.info(f"Finished {url}") + LOGGER.info(f"Finished {url}") shutil.rmtree(extracted_dir) manifest_path = os.path.join(path_to_data, f"libri_{split_type}_manifest.csv") @@ -121,9 +122,9 @@ def _process_file(wav_dir, txt_dir, base_filename, root_dir, sample_rate): help="Prunes training samples longer than the max duration (given in seconds).") def main(**kwargs): """Processes and downloads LibriSpeech dataset.""" - global logger - logger = logging.getLogger(SONOSCO) - setup_logging(logger) + global LOGGER + LOGGER = logging.getLogger(SONOSCO) + setup_logging(LOGGER) try_download_librispeech(**kwargs) diff --git a/tests/test_dataset.py b/tests/test_dataset.py new file mode 100644 index 0000000..977b549 --- /dev/null +++ b/tests/test_dataset.py @@ -0,0 +1,53 @@ +import logging +import os + +from sonosco.common.constants import SONOSCO +from sonosco.datasets.audio_dataset import AudioDataset, DataProcessor +from sonosco.datasets.audio_data_sampler import BucketingSampler +from 
sonosco.datasets.audio_data_loader import DataLoader +from sonosco.datasets.download_datasets.librispeech import try_download_librispeech + + +LOGGER = logging.getLogger(SONOSCO) +LIBRI_SPEECH_DIR = "temp/test_data/libri_speech" + + +def test_librispeech_download(): + # prepare + if os.path.exists(LIBRI_SPEECH_DIR): + os.removedirs(LIBRI_SPEECH_DIR) + + # get manifest file + manifest_directory = os.path.join(os.path.expanduser("~"), LIBRI_SPEECH_DIR) + test_manifest = os.path.join(manifest_directory, "libri_test_clean_manifest.csv") + + if not os.path.exists(test_manifest): + try_download_librispeech(LIBRI_SPEECH_DIR, 16000, ["test-clean.tar.gz", "test-other.tar.gz"], 1, 15) + + assert os.path.exists(test_manifest) + + +def test_librispeech_clean(): + # create data processor + audio_conf = dict(sample_rate=16000, window_size=.02, window_stride=.01, + labels='ABCDEFGHIJKLMNOPQRSTUVWXYZ', normalize=True, augment=False) + processor = DataProcessor(**audio_conf) + + # get manifest file + manifest_directory = os.path.join(os.path.expanduser("~"), LIBRI_SPEECH_DIR) + test_manifest = os.path.join(manifest_directory, "libri_test_clean_manifest.csv") + + if not os.path.exists(test_manifest): + try_download_librispeech(LIBRI_SPEECH_DIR, 16000, ["test-clean.tar.gz", "test-other.tar.gz"], 1, 15) + + assert os.path.exists(test_manifest) + + # create audio dataset + test_dataset = AudioDataset(processor, manifest_filepath=test_manifest) + LOGGER.info("Dataset is created") + test = test_dataset[0] + batch_size = 16 + sampler = BucketingSampler(test_dataset, batch_size=batch_size) + dataloader = DataLoader(dataset=test_dataset, num_workers=4, batch_sampler=sampler) + test_dataset[0] + From 2e8de3abab3fc1e5195ab8a488e0d11a8eaa78dd Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Sun, 16 Jun 2019 12:39:08 +0200 Subject: [PATCH 35/58] add augmentation utils --- sonosco/common/audio_tools.py | 21 ++++++++++++++++++++- sonosco/datasets/audio_dataset.py | 22 ++++++++++++---------- tests/test_dataset.py | 2 +- 3 files changed, 33 insertions(+), 12 deletions(-) diff --git a/sonosco/common/audio_tools.py b/sonosco/common/audio_tools.py index cd7a2bc..4cd1ac8 100644 --- a/sonosco/common/audio_tools.py +++ b/sonosco/common/audio_tools.py @@ -1,4 +1,6 @@ import subprocess +import numpy as np +import librosa def get_duration(file_path): @@ -8,8 +10,25 @@ def get_duration(file_path): def transcode_recording(source, destination, sample_rate): subprocess.call([f"sox {source} -r {sample_rate} -b 16 -c 1 {destination}"], shell=True) + def transcode_recordings_an4(raw_path, wav_path, sample_rate): subprocess.call([f'sox -t raw -r {sample_rate} -b 16 -e signed-integer -B -c 1 \"{raw_path}\" \"{wav_path}\"'], shell=True) + def transcode_recordings_ted3(source, destination, start_time, end_time, sample_rate): - subprocess.call([f"sox {source} -r {sample_rate} -b 16 -c 1 {destination} trim {start_time} ={end_time}"],shell=True) \ No newline at end of file + subprocess.call([f"sox {source} -r {sample_rate} -b 16 -c 1 {destination} trim {start_time} ={end_time}"],shell=True) + + +def add_noise(audio, std=0.005): + noise = np.random.randn(len(audio)) + data_noise = audio + std * noise + return data_noise + + +def shift(audio, n_samples=1600): + return np.roll(audio, n_samples) + + +def stretch(audio, rate=1): + stretched_audio = librosa.effects.time_stretch(audio, rate) + return stretched_audio diff --git a/sonosco/datasets/audio_dataset.py b/sonosco/datasets/audio_dataset.py index 8677396..3d7538b 100644 --- 
a/sonosco/datasets/audio_dataset.py +++ b/sonosco/datasets/audio_dataset.py @@ -3,17 +3,14 @@ # https://github.com/SeanNaren/deepspeech.pytorch # ---------------------------------------------------------------------------- -import warnings -import os import logging import torch import torchaudio +import librosa import sonosco.config.global_settings as global_settings +import sonosco.common.audio_tools as audio_tools -from typing import Tuple from torch.utils.data import Dataset -from sonosco.common.utils import setup_logging -from sonosco.common.constants import * LOGGER = logging.getLogger(__name__) @@ -55,10 +52,12 @@ def retrieve_file(audio_path): return sound, sample_rate @staticmethod - def augment_audio(sound, tempo_range: Tuple = (0.85, 1.15), gain_range: Tuple = (-6, 8)): - """Changes tempo and gain of the wave.""" - warnings.warn("Augmentation is not implemented") # TODO: Implement - return sound + def augment_audio(sound): + sound_array = sound.numpy().squeeze() + stretched_sound = audio_tools.stretch(sound_array, 0.5) + import pdb; pdb.set_trace() + stretched_sound = audio_tools.shift(stretched_sound, 4000) + return torch.from_numpy(stretched_sound) def parse_audio(self, audio_path): sound, sample_rate = self.retrieve_file(audio_path) @@ -66,8 +65,11 @@ def parse_audio(self, audio_path): if sample_rate != self.sample_rate: raise ValueError(f"The stated sample rate {self.sample_rate} and the factual rate {sample_rate} differ!") + librosa.output.write_wav("/Users/yuriy/Desktop/original.wav", sound.numpy().transpose(), sample_rate) + if self.augment: - sound = self.augment_audio(sound) + stretched_sound = self.augment_audio(sound) + librosa.output.write_wav("/Users/yuriy/Desktop/stretched.wav", stretched_sound.numpy().transpose(), sample_rate) if global_settings.CUDA_ENABLED: sound = sound.cuda() diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 977b549..359844f 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -30,7 +30,7 @@ def test_librispeech_download(): def test_librispeech_clean(): # create data processor audio_conf = dict(sample_rate=16000, window_size=.02, window_stride=.01, - labels='ABCDEFGHIJKLMNOPQRSTUVWXYZ', normalize=True, augment=False) + labels='ABCDEFGHIJKLMNOPQRSTUVWXYZ', normalize=True, augment=True) processor = DataProcessor(**audio_conf) # get manifest file From 420608068f7622d378a7bd9dd84460c04156e281 Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Sun, 16 Jun 2019 13:52:23 +0200 Subject: [PATCH 36/58] change to librosa --- sonosco/datasets/audio_dataset.py | 32 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/sonosco/datasets/audio_dataset.py b/sonosco/datasets/audio_dataset.py index 3d7538b..a38d0fd 100644 --- a/sonosco/datasets/audio_dataset.py +++ b/sonosco/datasets/audio_dataset.py @@ -5,8 +5,8 @@ import logging import torch -import torchaudio import librosa +import numpy as np import sonosco.config.global_settings as global_settings import sonosco.common.audio_tools as audio_tools @@ -46,18 +46,15 @@ def window_stride_samples(self): def window_size_samples(self): return int(self.sample_rate * self.window_stride) - @staticmethod - def retrieve_file(audio_path): - sound, sample_rate = torchaudio.load(audio_path) + def retrieve_file(self, audio_path): + sound, sample_rate = librosa.load(audio_path, sr=self.sample_rate) return sound, sample_rate @staticmethod def augment_audio(sound): - sound_array = sound.numpy().squeeze() - stretched_sound = audio_tools.stretch(sound_array, 
0.5) - import pdb; pdb.set_trace() - stretched_sound = audio_tools.shift(stretched_sound, 4000) - return torch.from_numpy(stretched_sound) + augmented = audio_tools.stretch(sound, 0.5) + augmented = audio_tools.shift(augmented, 4000) + return augmented def parse_audio(self, audio_path): sound, sample_rate = self.retrieve_file(audio_path) @@ -65,22 +62,23 @@ def parse_audio(self, audio_path): if sample_rate != self.sample_rate: raise ValueError(f"The stated sample rate {self.sample_rate} and the factual rate {sample_rate} differ!") - librosa.output.write_wav("/Users/yuriy/Desktop/original.wav", sound.numpy().transpose(), sample_rate) + librosa.output.write_wav("/Users/yuriy/Desktop/original.wav", sound, sample_rate) if self.augment: stretched_sound = self.augment_audio(sound) - librosa.output.write_wav("/Users/yuriy/Desktop/stretched.wav", stretched_sound.numpy().transpose(), sample_rate) + librosa.output.write_wav("/Users/yuriy/Desktop/stretched.wav", stretched_sound, sample_rate) if global_settings.CUDA_ENABLED: sound = sound.cuda() # TODO: comment why take the last element? - spectrogram = torch.stft(torch.from_numpy(sound.numpy().T.squeeze()), - n_fft=self.window_size_samples, - hop_length=self.window_stride_samples, - win_length=self.window_size_samples, - window=torch.hamming_window(self.window_size_samples), - normalized=self.normalize)[:, :, -1] + D = librosa.stft(sound, + n_fft=self.window_size_samples, + hop_length=self.window_stride_samples, + win_length=self.window_size_samples) + spectrogram, phase = librosa.magphase(D) + # S = log(S+1) + spectrogram = torch.from_numpy(np.log1p(spectrogram)) return spectrogram From 7f4b362376022f0c735e3e5257e0a659848938df Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Sun, 16 Jun 2019 15:22:53 +0200 Subject: [PATCH 37/58] add augmentation --- requirements.txt | 3 ++- sonosco/common/audio_tools.py | 7 ++++- sonosco/common/utils.py | 5 ++++ sonosco/datasets/audio_data_sampler.py | 1 + sonosco/datasets/audio_dataset.py | 16 ++++++++--- .../datasets/download_datasets/librispeech.py | 5 ++-- tests/test_dataset.py | 27 ++++++++++++------- 7 files changed, 46 insertions(+), 18 deletions(-) diff --git a/requirements.txt b/requirements.txt index 3547e7e..eaaa70e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,4 +20,5 @@ torchvision==0.3.0 tqdm==4.32.1 pyyaml==5.1 wget==3.2 -pytest \ No newline at end of file +pytest +click \ No newline at end of file diff --git a/sonosco/common/audio_tools.py b/sonosco/common/audio_tools.py index 4cd1ac8..45367a6 100644 --- a/sonosco/common/audio_tools.py +++ b/sonosco/common/audio_tools.py @@ -19,7 +19,7 @@ def transcode_recordings_ted3(source, destination, start_time, end_time, sample_ subprocess.call([f"sox {source} -r {sample_rate} -b 16 -c 1 {destination} trim {start_time} ={end_time}"],shell=True) -def add_noise(audio, std=0.005): +def add_noise(audio, std=0.002): noise = np.random.randn(len(audio)) data_noise = audio + std * noise return data_noise @@ -32,3 +32,8 @@ def shift(audio, n_samples=1600): def stretch(audio, rate=1): stretched_audio = librosa.effects.time_stretch(audio, rate) return stretched_audio + + +def pitch_shift(audio, sample_rate=16000, n_steps=3.0): + stretched_audio = librosa.effects.pitch_shift(audio, sr=sample_rate, n_steps=n_steps) + return stretched_audio diff --git a/sonosco/common/utils.py b/sonosco/common/utils.py index a570af0..5e86342 100644 --- a/sonosco/common/utils.py +++ b/sonosco/common/utils.py @@ -1,5 +1,6 @@ import logging import os +import numpy as np def 
setup_logging(logger: logging.Logger, filename=None, verbosity=False): @@ -19,3 +20,7 @@ def setup_logging(logger: logging.Logger, filename=None, verbosity=False): c_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') c_handler.setFormatter(c_format) logger.addHandler(c_handler) + + +def random_float(low: float, high: float): + return np.random.random() * (high - low) + low diff --git a/sonosco/datasets/audio_data_sampler.py b/sonosco/datasets/audio_data_sampler.py index 100bde7..b3bfc14 100644 --- a/sonosco/datasets/audio_data_sampler.py +++ b/sonosco/datasets/audio_data_sampler.py @@ -6,6 +6,7 @@ from torch.distributed.deprecated import get_rank from torch.distributed.deprecated import get_world_size + class BucketingSampler(Sampler): def __init__(self, data_source, batch_size=1): """ diff --git a/sonosco/datasets/audio_dataset.py b/sonosco/datasets/audio_dataset.py index a38d0fd..10a0a80 100644 --- a/sonosco/datasets/audio_dataset.py +++ b/sonosco/datasets/audio_dataset.py @@ -9,11 +9,17 @@ import numpy as np import sonosco.config.global_settings as global_settings import sonosco.common.audio_tools as audio_tools +import sonosco.common.utils as utils from torch.utils.data import Dataset LOGGER = logging.getLogger(__name__) +MIN_STRETCH = 0.7 +MAX_STRETCH = 1.3 +MIN_PITCH = 0.5 +MAX_PITCH = 2.0 +MAX_SHIFT = 4000 class DataProcessor: @@ -50,10 +56,12 @@ def retrieve_file(self, audio_path): sound, sample_rate = librosa.load(audio_path, sr=self.sample_rate) return sound, sample_rate - @staticmethod - def augment_audio(sound): - augmented = audio_tools.stretch(sound, 0.5) - augmented = audio_tools.shift(augmented, 4000) + def augment_audio(self, sound, stretch=True, shift=False, pitch=True, noise=True): + augmented = audio_tools.stretch(sound, utils.random_float(MIN_STRETCH, MAX_STRETCH)) if stretch else sound + augmented = audio_tools.shift(augmented, np.random.randint(MAX_SHIFT)) if shift else augmented + augmented = audio_tools.pitch_shift(augmented, self.sample_rate, + n_steps=utils.random_float(MIN_PITCH, MAX_PITCH)) if pitch else augmented + augmented = audio_tools.add_noise(augmented) if noise else augmented return augmented def parse_audio(self, audio_path): diff --git a/sonosco/datasets/download_datasets/librispeech.py b/sonosco/datasets/download_datasets/librispeech.py index d4fb0e7..246f0e2 100644 --- a/sonosco/datasets/download_datasets/librispeech.py +++ b/sonosco/datasets/download_datasets/librispeech.py @@ -122,11 +122,10 @@ def _process_file(wav_dir, txt_dir, base_filename, root_dir, sample_rate): help="Prunes training samples longer than the max duration (given in seconds).") def main(**kwargs): """Processes and downloads LibriSpeech dataset.""" - global LOGGER - LOGGER = logging.getLogger(SONOSCO) - setup_logging(LOGGER) try_download_librispeech(**kwargs) if __name__ == "__main__": + LOGGER = logging.getLogger(SONOSCO) + setup_logging(LOGGER) main() diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 359844f..45db22c 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -1,18 +1,26 @@ import logging import os +import pytest from sonosco.common.constants import SONOSCO +from sonosco.common.utils import setup_logging from sonosco.datasets.audio_dataset import AudioDataset, DataProcessor from sonosco.datasets.audio_data_sampler import BucketingSampler from sonosco.datasets.audio_data_loader import DataLoader from sonosco.datasets.download_datasets.librispeech import try_download_librispeech -LOGGER = 
logging.getLogger(SONOSCO) LIBRI_SPEECH_DIR = "temp/test_data/libri_speech" -def test_librispeech_download(): +@pytest.fixture +def logger(): + logger = logging.getLogger(SONOSCO) + setup_logging(logger) + return logger + + +def test_librispeech_download(logger): # prepare if os.path.exists(LIBRI_SPEECH_DIR): os.removedirs(LIBRI_SPEECH_DIR) @@ -22,12 +30,13 @@ def test_librispeech_download(): test_manifest = os.path.join(manifest_directory, "libri_test_clean_manifest.csv") if not os.path.exists(test_manifest): + logger.info("Starting to download dataset") try_download_librispeech(LIBRI_SPEECH_DIR, 16000, ["test-clean.tar.gz", "test-other.tar.gz"], 1, 15) assert os.path.exists(test_manifest) -def test_librispeech_clean(): +def test_librispeech_clean(logger): # create data processor audio_conf = dict(sample_rate=16000, window_size=.02, window_stride=.01, labels='ABCDEFGHIJKLMNOPQRSTUVWXYZ', normalize=True, augment=True) @@ -44,10 +53,10 @@ def test_librispeech_clean(): # create audio dataset test_dataset = AudioDataset(processor, manifest_filepath=test_manifest) - LOGGER.info("Dataset is created") - test = test_dataset[0] - batch_size = 16 - sampler = BucketingSampler(test_dataset, batch_size=batch_size) - dataloader = DataLoader(dataset=test_dataset, num_workers=4, batch_sampler=sampler) - test_dataset[0] + logger.info("Dataset is created") + test = test_dataset[4] + # batch_size = 16 + # sampler = BucketingSampler(test_dataset, batch_size=batch_size) + # dataloader = DataLoader(dataset=test_dataset, num_workers=4, batch_sampler=sampler) + # test_dataset[0] From f96661f938bd4f0f8b07af161b0f9bb0c96c957a Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Sun, 16 Jun 2019 15:27:02 +0200 Subject: [PATCH 38/58] remove test code --- sonosco/datasets/audio_dataset.py | 37 ++++++++++++++++++++----------- tests/test_dataset.py | 23 ++++++++++++++++--- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/sonosco/datasets/audio_dataset.py b/sonosco/datasets/audio_dataset.py index 10a0a80..0dea305 100644 --- a/sonosco/datasets/audio_dataset.py +++ b/sonosco/datasets/audio_dataset.py @@ -17,8 +17,8 @@ LOGGER = logging.getLogger(__name__) MIN_STRETCH = 0.7 MAX_STRETCH = 1.3 -MIN_PITCH = 0.5 -MAX_PITCH = 2.0 +MIN_PITCH = 0.7 +MAX_PITCH = 1.5 MAX_SHIFT = 4000 @@ -64,27 +64,29 @@ def augment_audio(self, sound, stretch=True, shift=False, pitch=True, noise=True augmented = audio_tools.add_noise(augmented) if noise else augmented return augmented - def parse_audio(self, audio_path): + def parse_audio(self, audio_path, raw=False): sound, sample_rate = self.retrieve_file(audio_path) if sample_rate != self.sample_rate: raise ValueError(f"The stated sample rate {self.sample_rate} and the factual rate {sample_rate} differ!") - librosa.output.write_wav("/Users/yuriy/Desktop/original.wav", sound, sample_rate) - if self.augment: - stretched_sound = self.augment_audio(sound) - librosa.output.write_wav("/Users/yuriy/Desktop/stretched.wav", stretched_sound, sample_rate) + sound = self.augment_audio(sound) + + if raw: + return sound + + sound_tensor = torch.from_numpy(sound) if global_settings.CUDA_ENABLED: - sound = sound.cuda() + sound_tensor = sound_tensor.cuda() # TODO: comment why take the last element? 
-        D = librosa.stft(sound,
-                         n_fft=self.window_size_samples,
-                         hop_length=self.window_stride_samples,
-                         win_length=self.window_size_samples)
-        spectrogram, phase = librosa.magphase(D)
+        complex_spectrogram = librosa.stft(sound,
+                                           n_fft=self.window_size_samples,
+                                           hop_length=self.window_stride_samples,
+                                           win_length=self.window_size_samples)
+        spectrogram, phase = librosa.magphase(complex_spectrogram)
         # S = log(S+1)
         spectrogram = torch.from_numpy(np.log1p(spectrogram))
@@ -119,6 +121,15 @@ def __init__(self, processor: DataProcessor, manifest_filepath):
         self.size = len(ids)
         self.processor = processor

+    def get_raw(self, index):
+        sample = self.ids[index]
+        audio_path, transcript_path = sample[0], sample[1]
+
+        sound = self.processor.parse_audio(audio_path, raw=True)
+        transcript = self.processor.parse_transcript(transcript_path)
+
+        return sound, transcript
+
     def __getitem__(self, index):
         sample = self.ids[index]
         audio_path, transcript_path = sample[0], sample[1]
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index 45db22c..a80c5c9 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -1,6 +1,8 @@
 import logging
 import os
 import pytest
+import shutil
+import numpy as np
+import librosa

 from sonosco.common.constants import SONOSCO
 from sonosco.common.utils import setup_logging
@@ -38,7 +42,7 @@ def test_librispeech_download(logger):

 def test_librispeech_clean(logger):
     # create data processor
-    audio_conf = dict(sample_rate=16000, window_size=.02, window_stride=.01,
+    audio_conf = dict(sample_rate=SAMPLE_RATE, window_size=.02, window_stride=.01,
                       labels='ABCDEFGHIJKLMNOPQRSTUVWXYZ', normalize=True, augment=True)
     processor = DataProcessor(**audio_conf)

@@ -47,14 +51,27 @@ def test_librispeech_clean(logger):

     if not os.path.exists(test_manifest):
-        try_download_librispeech(LIBRI_SPEECH_DIR, 16000, ["test-clean.tar.gz", "test-other.tar.gz"], 1, 15)
+        try_download_librispeech(LIBRI_SPEECH_DIR, SAMPLE_RATE, ["test-clean.tar.gz", "test-other.tar.gz"], 1, 15)

     assert os.path.exists(test_manifest)

     # create audio dataset
     test_dataset = AudioDataset(processor, manifest_filepath=test_manifest)
     logger.info("Dataset is created")
-    test = test_dataset[4]
+
+    if os.path.exists(TEST_WAVS_DIR):
+        shutil.rmtree(TEST_WAVS_DIR)
+
+    os.makedirs(TEST_WAVS_DIR)
+
+    n_samples = len(test_dataset)
+
+    ids = np.random.randint(n_samples, size=min(10, n_samples))
+
+    for index in ids:
+        sound, transcription = test_dataset.get_raw(index)
+        librosa.output.write_wav(os.path.join(TEST_WAVS_DIR, f"audio_{index}.wav"), sound, SAMPLE_RATE)
+
     # batch_size = 16
     # sampler = BucketingSampler(test_dataset, batch_size=batch_size)
     # dataloader = DataLoader(dataset=test_dataset, num_workers=4, batch_sampler=sampler)
     # test_dataset[0]
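The feature pipeline patch 38 lands on is a log-magnitude STFT. Pulled out of DataProcessor, the transformation looks like the following self-contained sketch, which mirrors parse_audio's librosa calls with the processor's default 20 ms window and 10 ms hop:

import librosa
import numpy as np
import torch


def log_magnitude_spectrogram(sound, sample_rate=16000,
                              window_size=0.02, window_stride=0.01):
    win_length = int(sample_rate * window_size)
    hop_length = int(sample_rate * window_stride)
    stft = librosa.stft(sound, n_fft=win_length,
                        hop_length=hop_length, win_length=win_length)
    magnitudes, _ = librosa.magphase(stft)
    # S = log(S + 1), as in parse_audio
    return torch.from_numpy(np.log1p(magnitudes))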
From 8d59622c84625adfce596cefe866697f814735a7 Mon Sep 17 00:00:00 2001
From: yuriyarabskyy
Date: Sun, 16 Jun 2019 16:09:01 +0200
Subject: [PATCH 39/58] start adding experiment class

---
 sonosco/training/__init__.py   |  0
 sonosco/training/experiment.py | 90 ++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+)
 create mode 100644 sonosco/training/__init__.py
 create mode 100644 sonosco/training/experiment.py

diff --git a/sonosco/training/__init__.py b/sonosco/training/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/sonosco/training/experiment.py b/sonosco/training/experiment.py
new file mode 100644
index 0000000..dd04c46
--- /dev/null
+++ b/sonosco/training/experiment.py
@@ -0,0 +1,90 @@
+import os
+import os.path as path
+import sys
+from time import time
+import datetime
+
+from .tee import Tee
+from .utils import *
+
+
+class Experiment:
+    """
+    Generates a folder where all experiments will be stored and then a named experiment with current
+    timestamp and provided name. Automatically starts logging the console output and creates a copy
+    of the currently executed code in the experiment folder. The experiment's subfolder paths are provided
+    to the outside as member variables. It also allows adding of more subfolders conveniently.
+    Args:
+        experiment_name (string): name of the experiment to be created
+        experiments_path (string): location where all experiments will be stored, default is './experiments'
+    Example:
+        >>> experiment = Experiment('mnist_classification')
+        >>> print(experiment.plots) # path to experiment plots
+    """
+
+    def __init__(self, experiment_name, experiments_path=None, exclude_dirs=[], exclude_files=[]):
+        self.experiments_path = self._set_experiments_dir(experiments_path)
+        self.name = self._set_experiment_name(experiment_name)
+        self.path = path.join(self.experiments_path, self.name)  # path to current experiment
+        self._sub_directories = ['plots', 'logs', 'code']  # default sub-directories
+
+        self._exclude_dirs = ['__pycache__', '.git', 'experiments']
+        self._exclude_dirs.extend(exclude_dirs)
+        self._exclude_files = ['.pyc']
+        self._exclude_files.extend(exclude_files)
+
+        self._init_directories()
+        self._tee = Tee(path.join(self.logs, 'console_output.log'), 'w')  # start to log console
+        self._copy_sourcecode()
+
+    def _set_experiments_dir(self, experiments_path):
+        if experiments_path is not None:
+            return experiments_path
+        local_path = os.path.dirname(sys.argv[0])
+        local_path = local_path if local_path != '' else './'
+        return path.join(local_path, "experiments")
+
+    def _set_experiment_name(self, experiment_name):
+        date_time = datetime.datetime.fromtimestamp(time()).strftime('%Y-%m-%d_%H:%M:%S')
+        return date_time + "_" + experiment_name
+
+    def _init_directories(self):
+        """ Create all basic directories. """
+        self._create_directory(self.experiments_path)
+        self._create_directory(path.join(self.experiments_path, self.name))
+        for sub_dir_name in self._sub_directories:
+            self.add_directory(sub_dir_name)
+
+    def _create_directory(self, dir_path):
+        if not path.exists(dir_path):
+            os.makedirs(dir_path)
+
+    def _add_member(self, key, value):
+        """ Add a member variable named 'key' with value 'value' to the experiment instance. """
+        self.__dict__[key] = value
+
+    def _copy_sourcecode(self):
+        """ Copy code from the execution directory into the experiment code directory. """
+        sources_path = os.path.dirname(sys.argv[0])
+        sources_path = sources_path if sources_path != '' else './'
+        copy_code(sources_path, self.code, exclude_dirs=self._exclude_dirs, exclude_files=self._exclude_files)
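experiment.py imports Tee and copy_code from sibling modules that are not part of this patch. For orientation, here is a minimal sketch of what such a console Tee could look like; this is an assumption about the missing module, not the project's actual implementation.

import sys


class Tee:
    """Mirrors everything written to stdout into a log file (a sketch)."""

    def __init__(self, file_path, mode='w'):
        self.file = open(file_path, mode)
        self.stdout = sys.stdout
        sys.stdout = self  # from now on, print() output passes through here

    def write(self, data):
        self.stdout.write(data)
        self.file.write(data)

    def flush(self):
        self.stdout.flush()
        self.file.flush()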
+ """ + # store in sub-dir list + if not dir_name in self._sub_directories: + self._sub_directories.append(dir_name) + # add as member + dir_path = path.join(self.experiments_path, self.name, dir_name) + self._add_member(dir_name, dir_path) + # create directory + self._create_directory(dir_path) + + def add_file(self, folder_path, filename, content): + """ Adds a file with provided content to folder. Convenience function. """ + with open(path.join(folder_path, filename), 'w') as textfile: + textfile.write(content) \ No newline at end of file From 7174f9c843df7ce15287ec7da138bc928c1202b0 Mon Sep 17 00:00:00 2001 From: "w.jurasz" Date: Sun, 16 Jun 2019 17:11:44 +0200 Subject: [PATCH 40/58] Initial class structure and save methods --- requirements.txt | 3 +- sonosco/model.py | 90 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 sonosco/model.py diff --git a/requirements.txt b/requirements.txt index 3547e7e..ae5ddac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,4 +20,5 @@ torchvision==0.3.0 tqdm==4.32.1 pyyaml==5.1 wget==3.2 -pytest \ No newline at end of file +pytest +deprecation==2.0.6 \ No newline at end of file diff --git a/sonosco/model.py b/sonosco/model.py new file mode 100644 index 0000000..5305556 --- /dev/null +++ b/sonosco/model.py @@ -0,0 +1,90 @@ +import logging +import torch +import deprecation + +LOGGER = logging.getLogger(__name__) + + +class Saver: + + def __init__(self) -> None: + super().__init__() + + @deprecation.deprecated( + details="This type of saving may cause problems when path of model class changes. Pleas use save_model instead") + def save_model_simple(self, model, path): + """ + Simply saves the model using pickle protocol. + Args: + model: model to save + path (string) : path where to save the model + + Returns: + + """ + torch.save(model, path) + + def save_model(self, model, path, infer_structure=False, serialize_f_name='serialize'): + """ + Saves the model using pickle protocol. + + If the infer_structure is True this method infers all the meta parameters of the model and save them together + with learnable parameters. + + If the infer_structure is False and method specified by serialize_f_name exists, the return value of the + serialize_f_name method is saved. + + If neither of above only learnable parameters a.k.a. state_dict are saved. 
+
+        Args:
+            model: model to save
+            path (string) : path where to save the model
+            infer_structure (bool): indicator whether to infer the model structure
+            serialize_f_name (string): name of the function that this method should call in order to serialize the model
+
+        Returns:
+
+        """
+        entity_to_save = None
+        if infer_structure:
+            entity_to_save = self.get_constructor_args_with_values(model)
+            entity_to_save['state_dict'] = model.state_dict()
+        elif hasattr(model, serialize_f_name) and callable(getattr(model, serialize_f_name)):
+            entity_to_save = getattr(model, serialize_f_name)()
+        else:
+            entity_to_save['state_dict'] = model.state_dict()
+
+        torch.save(entity_to_save, path)
+
+    @staticmethod
+    def get_constructor_args_with_values(model):
+        """
+        Assigns values to __init__ params names
+
+        For example:
+
+        class Bar():
+            def __init__(self, arg1, arg2):
+                self.arg1 = arg1
+                self.some_other_name = arg2
+
+
+        bar = Bar("A","B")
+        get_constructor_args_with_values(bar)
+        # returns {arg1: arg1_val, arg2: arg2_val}
+
+
+        Args:
+            model: model to infer from
+
+        Returns (dict): Mapping from __init__ argument to its value
+
+        """
+        return {}
+
+
+class Loader:
+
+    def load_model(self, cls, path):
+        package = torch.load(path, map_location=lambda storage, loc: storage)
+        cls()

From fb026367279db3f4f81e6a6772cd15dab068a2d3 Mon Sep 17 00:00:00 2001
From: yuriyarabskyy
Date: Sun, 16 Jun 2019 18:48:55 +0200
Subject: [PATCH 41/58] resolve comments

---
 requirements.txt              |  4 ++--
 sonosco/common/audio_tools.py | 25 +++++++++++++++++++++----
 tests/test_dataset.py         |  2 +-
 3 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index eaaa70e..97f2580 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,5 +20,5 @@ torchvision==0.3.0
 tqdm==4.32.1
 pyyaml==5.1
 wget==3.2
-pytest
-click
\ No newline at end of file
+pytest==4.6.3
+click==7.0
\ No newline at end of file
diff --git a/sonosco/common/audio_tools.py b/sonosco/common/audio_tools.py
index 45367a6..9ab77ab 100644
--- a/sonosco/common/audio_tools.py
+++ b/sonosco/common/audio_tools.py
@@ -19,10 +19,27 @@ def transcode_recordings_ted3(source, destination, start_time, end_time, sample_
     subprocess.call([f"sox {source} -r {sample_rate} -b 16 -c 1 {destination} trim {start_time} ={end_time}"],shell=True)
 
 
-def add_noise(audio, std=0.002):
-    noise = np.random.randn(len(audio))
-    data_noise = audio + std * noise
-    return data_noise
+class NoiseMaker:
+
+    def __call__(self, audio):
+        """Adds noise to the audio signal."""
+        pass
+
+
+class GaussianNoiseMaker(NoiseMaker):
+
+    def __init__(self, std=0.002):
+        self.std = std
+
+    def __call__(self, audio):
+        noise = np.random.randn(len(audio))
+        return audio + self.std * noise
+
+
+def add_noise(audio, noise_maker: NoiseMaker = None):
+    if noise_maker is None:
+        noise_maker = GaussianNoiseMaker()
+    return noise_maker(audio)
 
 
 def shift(audio, n_samples=1600):
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index a80c5c9..ddc70e9 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -43,7 +43,7 @@ def test_librispeech_clean(logger):
     # create data processor
     audio_conf = dict(sample_rate=SAMPLE_RATE, window_size=.02, window_stride=.01,
-                      labels='ABCDEFGHIJKLMNOPQRSTUVWXYZ', normalize=True, augment=True)
+                      labels='ABCDEFGHIJKLMNOPQRSTUVWXYZ', normalize=True, augment=False)
     processor = DataProcessor(**audio_conf)
 
     # get manifest file

From d82fa16ab56f4a3039c373ce0011df436409b42e Mon Sep 17 00:00:00 2001
From: "w.jurasz"
Date: Sun, 16 Jun 2019 22:42:57 +0200
Subject: [PATCH 42/58] Added model loading and some utils

---
 sonosco/common/class_utils.py |  36 +++++++++++
 sonosco/model.py              | 112 ++++++++++++++++++++++++++++------
 2 files changed, 131 insertions(+), 17 deletions(-)
 create mode 100644 sonosco/common/class_utils.py

diff --git a/sonosco/common/class_utils.py b/sonosco/common/class_utils.py
new file mode 100644
index 0000000..b73cafa
--- /dev/null
+++ b/sonosco/common/class_utils.py
@@ -0,0 +1,36 @@
+import inspect
+from typing import List
+
+
+def get_constructor_args(cls: type) -> List[str]:
+    """
+    E.g.
+
+    class Bar():
+        def __init__(self, arg1, arg2):
+
+    get_constructor_args(Bar)
+    # returns ['arg1', 'arg2']
+    Args:
+        cls (type):
+
+    Returns: list containing names of constructor arguments
+
+    """
+    return inspect.getfullargspec(cls.__init__).args[1:]
+
+
+def get_class_by_name(name: str) -> type:
+    """
+    Returns type object of class specified by name
+    Args:
+        name: full name of the class (with packages)
+
+    Returns: class object
+
+    """
+    components = name.split('.')
+    mod = __import__(components[0])
+    for comp in components[1:]:
+        mod = getattr(mod, comp)
+    return mod
diff --git a/sonosco/model.py b/sonosco/model.py
index 5305556..5bbbdea 100644
--- a/sonosco/model.py
+++ b/sonosco/model.py
@@ -1,6 +1,10 @@
 import logging
 import torch
 import deprecation
+import inspect
+import torch.nn as nn
+
+from common.class_utils import get_constructor_args, get_class_by_name
 
 LOGGER = logging.getLogger(__name__)
 
@@ -12,52 +16,53 @@ def __init__(self) -> None:
 
     @deprecation.deprecated(
         details="This type of saving may cause problems when path of model class changes. Please use save_model instead")
-    def save_model_simple(self, model, path):
+    def save_model_simple(self, model: nn.Module, path: str) -> None:
         """
         Simply saves the model using pickle protocol.
         Args:
-            model: model to save
-            path (string) : path where to save the model
+            model (nn.Module): model to save
+            path (str) : path where to save the model
 
         Returns:
 
         """
         torch.save(model, path)
 
-    def save_model(self, model, path, infer_structure=False, serialize_f_name='serialize'):
+    def save_model(self, model: nn.Module, path: str, infer_structure: bool = False,
+                   serialize_method_name: str = 'serialize') -> None:
         """
         Saves the model using pickle protocol.
 
         If the infer_structure is True this method infers all the meta parameters of the model and saves them together
         with learnable parameters.
 
-        If the infer_structure is False and method specified by serialize_f_name exists, the return value of the
-        serialize_f_name method is saved.
+        If the infer_structure is False and method specified by serialize_method_name exists, the return value of the
+        serialize_method_name method is saved.
 
         If neither of the above applies, only learnable parameters (a.k.a. state_dict) are saved.
 
         Args:
-            model: model to save
-            path (string) : path where to save the model
+            model (nn.Module): model to save
+            path (str) : path where to save the model
             infer_structure (bool): indicator whether to infer the model structure
-            serialize_f_name (string): name of the function that this method should call in order to serialize the model
+            serialize_method_name (str): name of the function that this method should call in order to serialize the
+            model. Must return dict.
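+            A hedged usage sketch (model and path are placeholders):
+            >>> Saver().save_model(model, 'checkpoints/model.pth', infer_structure=True)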
 
         Returns:
 
         """
-        entity_to_save = None
         if infer_structure:
             entity_to_save = self.get_constructor_args_with_values(model)
             entity_to_save['state_dict'] = model.state_dict()
-        elif hasattr(model, serialize_f_name) and callable(getattr(model, serialize_f_name)):
-            entity_to_save = getattr(model, serialize_f_name)()
+        elif hasattr(model, serialize_method_name) and callable(getattr(model, serialize_method_name)):
+            entity_to_save = getattr(model, serialize_method_name)()
         else:
-            entity_to_save['state_dict'] = model.state_dict()
+            entity_to_save = {'state_dict': model.state_dict()}
 
         torch.save(entity_to_save, path)
 
     @staticmethod
-    def get_constructor_args_with_values(model):
+    def get_constructor_args_with_values(model: nn.Module):
         """
         Assigns values to __init__ params names
 
@@ -75,7 +80,7 @@ def __init__(self, arg1, arg2):
 
         Args:
-            model: model to infer from
+            model (nn.Module): model to infer from
 
         Returns (dict): Mapping from __init__ argument to its value
 
@@ -85,6 +90,79 @@ def __init__(self, arg1, arg2):
 
 class Loader:
 
-    def load_model(self, cls, path):
+    @deprecation.deprecated(
+        details="This type of loading may cause problems when path of model class changes. "
+                "Please use only when saved with save_model_simple method")
+    def load_model_simple(self, path: str):
+        """
+        Loads a model that was saved with save_model_simple (a pickle of the full model object).
+
+        Args:
+            path (str): path to the pickled model
+
+        Returns: the unpickled model
+
+        """
+        return torch.load(path)
+
+    def load_model_from_path(self, cls_path: str, path: str, deserialize_method_name: str = 'deserialize') -> nn.Module:
+        """
+        Loads the model from pickle file.
+
+        If deserialize_method_name exists the deserialized content of pickle file in path is passed to the
+        deserialize_method_name method. In this case,
+        the responsibility of creating cls object stays at the caller side.
+
+        Args:
+            cls_path (str): name of the class of the model
+            path (str): path to pickle-serialized model or model parameters
+            deserialize_method_name (str): name of the function that this method should call in order to deserialize the
+            model. Must accept single argument of type dict.
+
+
+        Returns (nn.Module): Loaded model
+
+        """
+        return self.load_model(get_class_by_name(cls_path), path, deserialize_method_name)
+
+    def load_model(self, cls: type, path: str, deserialize_method_name: str = 'deserialize') -> nn.Module:
+        """
+        Loads the model from pickle file.
+
+        If deserialize_method_name exists the deserialized content of pickle file in path is passed to the
+        deserialize_method_name method. In this case,
+        the responsibility of creating cls object stays at the caller side.
+
+        Args:
+            cls (type): class object of the model
+            path (str): path to pickle-serialized model or model parameters
+            deserialize_method_name (str): name of the function that this method should call in order to deserialize the
+            model. Must accept single argument of type dict.
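+            A usage sketch (class and checkpoint path are illustrative):
+            >>> model = Loader().load_model(DeepSpeech2, 'checkpoints/final.pth')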
+
+
+        Returns (nn.Module): Loaded model
+
+        """
         package = torch.load(path, map_location=lambda storage, loc: storage)
-        cls()
+        if hasattr(cls, deserialize_method_name) and callable(getattr(cls, deserialize_method_name)):
+            return getattr(cls, deserialize_method_name)(package)
+        constructor_args = set(get_constructor_args(cls))
+        stored_keys = set(package.keys())
+        stored_keys.remove('state_dict')
+
+        args_to_apply = constructor_args & stored_keys
+        if len(args_to_apply) != len(constructor_args):
+            not_in_constructor = stored_keys - constructor_args
+            if not_in_constructor:
+                LOGGER.warning(
+                    f"Following fields were deserialized "
+                    f"but could not be found in constructor of provided class {not_in_constructor}")
+            not_in_package = constructor_args - stored_keys
+            if not_in_package:
+                LOGGER.warning(
+                    f"Following fields exist in class constructor "
+                    f"but could not be found in serialized package {not_in_package}")
+
+        filtered_package = {key: package[key] for key in args_to_apply}
+        model = cls(**filtered_package)
+        model.load_state_dict(package['state_dict'])
+        return model

From 678095d49ecdefe3dbb4100bd334418d5f3f5292 Mon Sep 17 00:00:00 2001
From: yuriyarabskyy
Date: Mon, 17 Jun 2019 18:22:54 +0200
Subject: [PATCH 43/58] add experiment

---
 sonosco/common/utils.py        | 69 +++++++++++++++++++++++++++++++++-
 sonosco/training/experiment.py | 60 ++++++++++++++++------------
 2 files changed, 102 insertions(+), 27 deletions(-)

diff --git a/sonosco/common/utils.py b/sonosco/common/utils.py
index 5e86342..4dcbaba 100644
--- a/sonosco/common/utils.py
+++ b/sonosco/common/utils.py
@@ -1,6 +1,11 @@
 import logging
-import os
 import numpy as np
+import os
+import subprocess
+import os.path as path
+
+from shutil import copyfile
+from typing import Tuple
 
 
 def setup_logging(logger: logging.Logger, filename=None, verbosity=False):
@@ -24,3 +29,65 @@ def setup_logging(logger: logging.Logger, filename=None, verbosity=False):
 
 def random_float(low: float, high: float):
     return np.random.random() * (high - low) + low
+
+
+def copy_code(source_dir, dest_dir, exclude_dirs: Tuple[str] = tuple(), exclude_files: Tuple[str] = tuple()):
+    """
+    Copies code from source_dir to dest_dir. Excludes specified folders and files by substring-matching.
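+    A call sketch (paths are illustrative):
+    >>> copy_code('./sonosco', './experiments/run/code', exclude_dirs=('.git',), exclude_files=('.pyc',))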
+    Parameters:
+        source_dir (string): location of the code to copy
+        dest_dir (string): location where the code should be copied to
+        exclude_dirs (tuple of strings): folders containing strings specified in this tuple will be ignored
+        exclude_files (tuple of strings): files containing strings specified in this tuple will be ignored
+    """
+    source_basename = path.basename(source_dir)
+    for root, dirs, files in os.walk(source_dir, topdown=True):
+
+        # skip ignored dirs
+        if any(ex_subdir in root for ex_subdir in exclude_dirs):
+            continue
+
+        # construct destination dir
+        cropped_root = root[2:] if (root[:2] == './') else root
+        subdir_basename = path.basename(cropped_root)
+
+        # do not treat the root as a subdir
+        if subdir_basename == source_basename:
+            subdir_basename = ""
+        dest_subdir = os.path.join(dest_dir, subdir_basename)
+
+        # create destination folder
+        if not os.path.exists(dest_subdir):
+            os.makedirs(dest_subdir)
+
+        # copy files
+        for filename in filter(lambda x: not any(substr in x for substr in exclude_files), files):
+            source_file_path = os.path.join(root, filename)
+            dest_file_path = os.path.join(dest_subdir, filename)
+            copyfile(source_file_path, dest_file_path)
+
+
+def retrieve_git_hash():
+    """
+    Retrieves and returns the current git hash if execution location is a git repo.
+    """
+    try:
+        git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).strip()
+        return git_hash
+    except subprocess.CalledProcessError as e:
+        print(e.output)
+        return False
+
+
+def save_run_params_in_file(folder_path, run_config):
+    """
+    Receives a run_config class, retrieves all member variables and saves them
+    in a config file for logging purposes.
+    Parameters:
+        folder_path - output folder
+        run_config - shallow class with parameter members
+    """
+    with open(path.join(folder_path, "run_params.conf"), 'w') as run_param_file:
+        for attr, value in sorted(run_config.__dict__.items()):
+            run_param_file.write(f"{attr}: {value}\n")
diff --git a/sonosco/training/experiment.py b/sonosco/training/experiment.py
index dd04c46..23f46e0 100644
--- a/sonosco/training/experiment.py
+++ b/sonosco/training/experiment.py
@@ -1,11 +1,11 @@
 import os
 import os.path as path
-import sys
 from time import time
 import datetime
+import sonosco.common.path_utils as path_utils
 
-from .tee import Tee
-from .utils import *
+from time import time
+from sonosco.common.utils import copy_code
 
 
 class Experiment:
@@ -22,44 +22,49 @@ class Experiment:
         >>> print(experiment.plots) # path to experiment plots
     """
 
-    def __init__(self, experiment_name, experiments_path=None, exclude_dirs=[], exclude_files=[]):
+    def __init__(self,
+                 experiment_name,
+                 experiments_path=None,
+                 sub_directories=("plots", "logs", "code"),
+                 exclude_dirs=('__pycache__', '.git', 'experiments'),
+                 exclude_files=('.pyc',)):
+
         self.experiments_path = self._set_experiments_dir(experiments_path)
         self.name = self._set_experiment_name(experiment_name)
         self.path = path.join(self.experiments_path, self.name) # path to current experiment
-        self._sub_directories = ['plots', 'logs', 'code'] # default sub-directories
+        self.logs = path.join(self.experiments_path, "logs")
+        self.code = path.join(self.experiments_path, "code")
+        self._sub_directories = sub_directories
 
-        self._exclude_dirs = ['__pycache__', '.git', 'experiments']
+        self._exclude_dirs = exclude_dirs
         self._exclude_dirs.extend(exclude_dirs)
-        self._exclude_files = ['.pyc']
+
self._exclude_files = exclude_files self._exclude_files.extend(exclude_files) - self._init_directories() - self._tee = Tee(path.join(self.logs, 'console_output.log'), 'w') # start to log console self._copy_sourcecode() - def _set_experiments_dir(self, experiments_path): - if experiments_path != None: + @staticmethod + def _set_experiments_dir(experiments_path): + if experiments_path is not None: return experiments_path + local_path = os.path.dirname(sys.argv[0]) local_path = local_path if local_path != '' else './' return path.join(local_path, "experiments") - def _set_experiment_name(self, experiment_name): + @staticmethod + def _set_experiment_name(experiment_name): date_time = datetime.datetime.fromtimestamp(time()).strftime('%Y-%m-%d_%H:%M:%S') - return date_time + "_" + experiment_name + return f"{date_time}_{experiment_name}" def _init_directories(self): """ Create all basic directories. """ - self._create_directory(self.experiments_path) - self._create_directory(path.join(self.experiments_path, self.name)) + path_utils.try_create_directory(self.experiments_path) + path_utils.try_create_directory(path.join(self.experiments_path, self.name)) for sub_dir_name in self._sub_directories: self.add_directory(sub_dir_name) - def _create_directory(self, dir_path): - if not path.exists(dir_path): - os.makedirs(dir_path) - def _add_member(self, key, value): """ Add a member variable named 'key' with value 'value' to the experiment instance. """ self.__dict__[key] = value @@ -68,7 +73,9 @@ def _copy_sourcecode(self): """ Copy code from execution directory in experiment code directory. """ sources_path = os.path.dirname(sys.argv[0]) sources_path = sources_path if sources_path != '' else './' - copy_code(sources_path, self.code, exclude_dirs=self._exclude_dirs, exclude_files=self._exclude_files )# exclude_dirs=[path.basename(self.experiments_path), '.vscode', '.git']) + copy_code(sources_path, self.code, + exclude_dirs=self._exclude_dirs, + exclude_files=self._exclude_files) def add_directory(self, dir_name): """ @@ -76,15 +83,16 @@ def add_directory(self, dir_name): created and provided to the outside as a member variable. """ # store in sub-dir list - if not dir_name in self._sub_directories: + if dir_name not in self._sub_directories: self._sub_directories.append(dir_name) # add as member dir_path = path.join(self.experiments_path, self.name, dir_name) self._add_member(dir_name, dir_path) # create directory - self._create_directory(dir_path) + path_utils.try_create_directory(dir_path) - def add_file(self, folder_path, filename, content): + @staticmethod + def add_file(folder_path, filename, content): """ Adds a file with provided content to folder. Convenience function. 
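        Example (folder and file name are illustrative):
        >>> experiment.add_file(experiment.logs, 'notes.txt', 'some content')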
""" - with open(path.join(folder_path, filename), 'w') as textfile: - textfile.write(content) \ No newline at end of file + with open(path.join(folder_path, filename), 'w') as text_file: + text_file.write(content) From fbb90023f9734fa01e8d70fe9628567beec03cc7 Mon Sep 17 00:00:00 2001 From: ga38nif Date: Tue, 18 Jun 2019 22:37:23 +0200 Subject: [PATCH 44/58] delete test script --- sonosco/datasets/datasets_test_script.py | 33 ------------------------ 1 file changed, 33 deletions(-) delete mode 100644 sonosco/datasets/datasets_test_script.py diff --git a/sonosco/datasets/datasets_test_script.py b/sonosco/datasets/datasets_test_script.py deleted file mode 100644 index f4daf79..0000000 --- a/sonosco/datasets/datasets_test_script.py +++ /dev/null @@ -1,33 +0,0 @@ -import os - -from AudioDataLoader import AudioDataLoader -from AudioDataSampler import BucketingSampler, DistributedBucketingSampler -from AudioDataset import AudioDataset - - -def main(): - labels_path = "/Users/florianlay/roboy/sonosco/sonosco/datasets/labels.json" - with open(labels_path) as label_file: - labels = str(''.join(json.load(label_file))) - - audio_conf = dict(sample_rate=16000, - window_size=.02, - window_stride=.01, - window='hamming') - - manifest_directory = os.path.join(os.path.expanduser("~"), "temp/data/libri_speech") - test_manifest = os.path.join(manifest_directory, "libri_test_clean_manifest.csv") - labels = 'abc' - test_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=test_manifest, labels=labels, - normalize=False, augment=False) - print("Dataset is created\n====================\n") - - test = test_dataset[0] - batch_size = 16 - sampler = BucketingSampler(test_dataset, batch_size=batch_size) - dataloader = AudioDataLoader(test_dataset,num_workers=4, batch_sampler=sampler) - - inputs, targets, input_percentages, target_sizes = next(iter(dataloader)) - print(test) -if __name__ == "__main__": - main() \ No newline at end of file From 0590f01fc00dc0ea296baf9eccaad35dc27eb083 Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Wed, 19 Jun 2019 17:34:35 +0200 Subject: [PATCH 45/58] start creating training interface --- .gitignore | 2 + sonosco/common/path_utils.py | 7 + sonosco/common/utils.py | 24 +- sonosco/config/train.yaml | 1 + sonosco/datasets/__init__.py | 3 + sonosco/datasets/audio_data_loader.py | 33 --- sonosco/datasets/audio_dataset.py | 4 +- sonosco/datasets/data_loader.py | 59 ++++ ...{audio_data_sampler.py => data_sampler.py} | 0 .../datasets/download_datasets/__init__.py | 3 + sonosco/models/__init__.py | 1 + sonosco/run_training.py | 39 +++ sonosco/train.py | 18 -- sonosco/training/__init__.py | 2 + sonosco/training/abstract_callback.py | 25 ++ sonosco/training/experiment.py | 28 +- sonosco/training/helpers.py | 143 ++++++++++ sonosco/training/learning_rates.py | 131 +++++++++ sonosco/training/trainer.py | 252 ++++++++++++++++++ tests/test_dataset.py | 8 +- 20 files changed, 708 insertions(+), 75 deletions(-) delete mode 100644 sonosco/datasets/audio_data_loader.py create mode 100644 sonosco/datasets/data_loader.py rename sonosco/datasets/{audio_data_sampler.py => data_sampler.py} (100%) create mode 100644 sonosco/run_training.py delete mode 100644 sonosco/train.py create mode 100644 sonosco/training/abstract_callback.py create mode 100644 sonosco/training/helpers.py create mode 100644 sonosco/training/learning_rates.py create mode 100644 sonosco/training/trainer.py diff --git a/.gitignore b/.gitignore index 453b1ff..e2e1859 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ # 
Created by .ignore support plugin (hsz.mobi)
 sonosco/pycandle/
+experiments/
+tests/test_wavs/
 sonosco/pycandle
 sonosco/experiments/
 sonosco/datasets/download_datasets/
diff --git a/sonosco/common/path_utils.py b/sonosco/common/path_utils.py
index bf8c8ef..03321bb 100644
--- a/sonosco/common/path_utils.py
+++ b/sonosco/common/path_utils.py
@@ -1,5 +1,7 @@
 import os
 import wget
+import yaml
+import codecs
 
 
 def try_create_directory(path: str):
@@ -10,3 +12,8 @@ def try_create_directory(path: str):
 def try_download(destination: str, url: str):
     if not os.path.exists(destination):
         wget.download(url, destination)
+
+
+def parse_yaml(file_path: str):
+    with codecs.open(file_path, "r", "utf-8") as file:
+        return yaml.load(file)
diff --git a/sonosco/common/utils.py b/sonosco/common/utils.py
index 4dcbaba..d5b8d25 100644
--- a/sonosco/common/utils.py
+++ b/sonosco/common/utils.py
@@ -10,16 +10,10 @@ def setup_logging(logger: logging.Logger, filename=None, verbosity=False):
     logger.setLevel(logging.DEBUG)
+
     if filename is not None:
-        log_directory = os.path.dirname(filename)
-        if not os.path.exists(log_directory):
-            os.makedirs(log_directory)
-        filename = os.path.join(log_directory, f"{filename}.log")
-        f_handler = logging.FileHandler(filename=filename, mode="w")
-        f_handler.setLevel(logging.DEBUG)
-        f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-        f_handler.setFormatter(f_format)
-        logger.addHandler(f_handler)
+        add_log_file(filename, logger)
+
     c_handler = logging.StreamHandler()
     c_handler.setLevel(logging.DEBUG) if verbosity else c_handler.setLevel(logging.INFO)
     c_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
@@ -27,6 +21,18 @@ def setup_logging(logger: logging.Logger, filename=None, verbosity=False):
     logger.addHandler(c_handler)
 
 
+def add_log_file(filename: str, logger: logging.Logger):
+    log_directory = os.path.dirname(filename)
+    if log_directory and not os.path.exists(log_directory):
+        os.makedirs(log_directory)
+    f_handler = logging.FileHandler(filename=f"{filename}.log", mode="w")
+    f_handler.setLevel(logging.DEBUG)
+    f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+    f_handler.setFormatter(f_format)
+    logger.addHandler(f_handler)
+
+
 def random_float(low: float, high: float):
     return np.random.random() * (high - low) + low
diff --git a/sonosco/config/train.yaml b/sonosco/config/train.yaml
index 25be72c..3e9cba9 100644
--- a/sonosco/config/train.yaml
+++ b/sonosco/config/train.yaml
@@ -16,6 +16,7 @@ train:
   hidden_size: 800 # Hidden size of RNNs
   hidden_layers: 5 # Number of RNN layers
   rnn_type: 'gru' # Type of the RNN unit: gru|lstm are supported
+  labels: 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' # labels used by the model
 
   max_epochs: 70 # Number of training epochs
   learning_rate: 3e-4 # Initial learning rate
diff --git a/sonosco/datasets/__init__.py b/sonosco/datasets/__init__.py
index e69de29..9cfa7a3 100644
--- a/sonosco/datasets/__init__.py
+++ b/sonosco/datasets/__init__.py
@@ -0,0 +1,3 @@
+from .audio_dataset import AudioDataProcessor, AudioDataset
+from .data_sampler import BucketingSampler
+from .data_loader import AudioDataLoader, create_data_loaders
diff --git a/sonosco/datasets/audio_data_loader.py b/sonosco/datasets/audio_data_loader.py
deleted file mode 100644
index 967b89c..0000000
--- a/sonosco/datasets/audio_data_loader.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import numpy as np
-import torch
-
-from torch.utils.data import Dataset, DataLoader, Sampler
-
-
-class AudioDataLoader(DataLoader): - - def __init__(self, *args, **kwargs): - """Creates a data loader for AudioDatasets.""" - super(AudioDataLoader, self).__init__(*args, **kwargs) - self.collate_fn = self._collate_fn - - # TODO: Optimise - def _collate_fn(self, batch): - batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True) - longest_sample = batch[0][0] - freq_size, max_seqlength = longest_sample.size() - minibatch_size = len(batch) - inputs = torch.zeros(minibatch_size, 1, freq_size, max_seqlength) - input_percentages = torch.FloatTensor(minibatch_size) - target_sizes = np.zeros(minibatch_size, dtype=np.int32) - - # TODO: Numpy broadcasting magic - targets = [] - - for x in range(minibatch_size): - inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) - input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) - target_sizes[x] = len(batch[x][1]) - targets.extend(batch[x][1]) - - return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) \ No newline at end of file diff --git a/sonosco/datasets/audio_dataset.py b/sonosco/datasets/audio_dataset.py index 0dea305..c82e697 100644 --- a/sonosco/datasets/audio_dataset.py +++ b/sonosco/datasets/audio_dataset.py @@ -22,7 +22,7 @@ MAX_SHIFT = 4000 -class DataProcessor: +class AudioDataProcessor: def __init__(self, window_stride, window_size, sample_rate, labels="abc", normalize=False, augment=False): """ @@ -104,7 +104,7 @@ def parse_transcript(self, transcript_path): class AudioDataset(Dataset): - def __init__(self, processor: DataProcessor, manifest_filepath): + def __init__(self, processor: AudioDataProcessor, manifest_filepath): """ Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by a comma. Each new line is a different sample. 
Example below: diff --git a/sonosco/datasets/data_loader.py b/sonosco/datasets/data_loader.py new file mode 100644 index 0000000..b9f1115 --- /dev/null +++ b/sonosco/datasets/data_loader.py @@ -0,0 +1,59 @@ +import numpy as np +import logging +import torch + +from torch.utils.data import Dataset, DataLoader, Sampler +from .audio_dataset import AudioDataProcessor, AudioDataset +from .data_sampler import BucketingSampler + + +LOGGER = logging.getLogger(__name__) + + +class AudioDataLoader(DataLoader): + + def __init__(self, *args, **kwargs): + """Creates a data loader for AudioDatasets.""" + super(AudioDataLoader, self).__init__(*args, **kwargs) + self.collate_fn = self._collate_fn + + # TODO: Optimise + def _collate_fn(self, batch): + batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True) + longest_sample = batch[0][0] + freq_size, max_seqlength = longest_sample.size() + minibatch_size = len(batch) + inputs = torch.zeros(minibatch_size, 1, freq_size, max_seqlength) + input_percentages = torch.FloatTensor(minibatch_size) + target_sizes = np.zeros(minibatch_size, dtype=np.int32) + + # TODO: Numpy broadcasting magic + targets = [] + + for x in range(minibatch_size): + inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) + input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) + target_sizes[x] = len(batch[x][1]) + targets.extend(batch[x][1]) + + return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) + + +def create_data_loaders(train_manifest, val_manifest, batch_size, num_data_workers, **kwargs): + processor = AudioDataProcessor(**kwargs) + + # create train loader + train_dataset = AudioDataset(processor, manifest_filepath=train_manifest) + LOGGER.info(f"Training dataset containing {len(train_dataset)} samples is created") + sampler = BucketingSampler(train_dataset, batch_size=batch_size) + train_loader = AudioDataLoader(dataset=train_dataset, num_workers=num_data_workers, batch_sampler=sampler) + LOGGER.info("Training data loader created.") + + # create validation loader + val_dataset = AudioDataset(processor, manifest_filepath=val_manifest) + LOGGER.info(f"Validation dataset containing {len(val_dataset)} samples is created") + sampler = BucketingSampler(val_dataset, batch_size=batch_size) + val_loader = AudioDataLoader(dataset=val_dataset, num_workers=num_data_workers, batch_sampler=sampler) + LOGGER.info("Validation data loader created.") + + return train_loader, val_loader diff --git a/sonosco/datasets/audio_data_sampler.py b/sonosco/datasets/data_sampler.py similarity index 100% rename from sonosco/datasets/audio_data_sampler.py rename to sonosco/datasets/data_sampler.py diff --git a/sonosco/datasets/download_datasets/__init__.py b/sonosco/datasets/download_datasets/__init__.py index e69de29..2ef1650 100644 --- a/sonosco/datasets/download_datasets/__init__.py +++ b/sonosco/datasets/download_datasets/__init__.py @@ -0,0 +1,3 @@ +def download_all_datasets(path: str): + """Downloads all datasets which are missing.""" + pass diff --git a/sonosco/models/__init__.py b/sonosco/models/__init__.py index e69de29..eaab54d 100644 --- a/sonosco/models/__init__.py +++ b/sonosco/models/__init__.py @@ -0,0 +1 @@ +from .deepspeech2 import DeepSpeech2 \ No newline at end of file diff --git a/sonosco/run_training.py b/sonosco/run_training.py new file mode 100644 index 0000000..62222f7 --- /dev/null +++ b/sonosco/run_training.py @@ -0,0 +1,39 @@ +import logging +import click +import torch.nn.functional as torch_functional + +from 
sonosco.common.constants import SONOSCO
+from sonosco.common.utils import setup_logging
+from sonosco.common.path_utils import parse_yaml
+from sonosco.training import Experiment, ModelTrainer
+from sonosco.datasets import create_data_loaders
+from sonosco.models import DeepSpeech2
+
+LOGGER = logging.getLogger(SONOSCO)
+
+
+@click.command()
+@click.option("-e", "--experiment_name", default="default", type=click.STRING, help="Experiment name.")
+@click.option("-c", "--config_path", default="config/train.yaml", type=click.Path(), help="Path to train configurations.")
+def main(experiment_name, config_path):
+    Experiment.create(experiment_name)
+    config = parse_yaml(config_path)["train"]
+
+    train_loader, val_loader = create_data_loaders(**config)
+
+    # TODO: change to load different models dynamically
+    model = DeepSpeech2()
+
+    trainer = ModelTrainer(model, loss=torch_functional.ctc_loss, epochs=config["max_epochs"],
+                           train_data_loader=train_loader, val_data_loader=val_loader,
+                           lr=config["learning_rate"])
+
+    try:
+        trainer.start_training()
+    except KeyboardInterrupt:
+        trainer.stop_training()
+
+
+if __name__ == '__main__':
+    setup_logging(LOGGER)
+    main()
diff --git a/sonosco/train.py b/sonosco/train.py
deleted file mode 100644
index a763a47..0000000
--- a/sonosco/train.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import argparse
-from typing import Dict
-
-import yaml
-
-from modelwrapper import ModelWrapper
-
-parser = argparse.ArgumentParser(description='ASR training')
-parser.add_argument('--config', metavar='DIR',
-                    help='Path to train config file', default='config/train.yaml')
-
-if __name__ == '__main__':
-    args = parser.parse_args()
-    with open(args.config, 'r') as file:
-        config = yaml.load(file)
-    config_dict: Dict = config["train"]
-    model = ModelWrapper(**config_dict)
-    model.train()
diff --git a/sonosco/training/__init__.py b/sonosco/training/__init__.py
index e69de29..05caba2 100644
--- a/sonosco/training/__init__.py
+++ b/sonosco/training/__init__.py
@@ -0,0 +1,2 @@
+from .experiment import Experiment
+from .trainer import ModelTrainer
diff --git a/sonosco/training/abstract_callback.py b/sonosco/training/abstract_callback.py
new file mode 100644
index 0000000..8d920b6
--- /dev/null
+++ b/sonosco/training/abstract_callback.py
@@ -0,0 +1,25 @@
+from abc import ABC, abstractmethod
+
+
+class AbstractCallback(ABC):
+    """
+    Interface that defines how callbacks must be specified.
+    """
+
+    @abstractmethod
+    def __call__(self, epoch, step, performance_measures, context):
+        """
+        Called after every batch by the ModelTrainer.
+        Parameters:
+            epoch (int): current epoch number
+            step (int): current batch number
+            performance_measures (dict): losses and metrics based on a running average
+            context (ModelTrainer): reference to the calling ModelTrainer, allows to access members
+        """
+        pass
+
+    def close(self):
+        """
+        Handle cleanup work if necessary. Will be called at the end of the last epoch.
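+        For reference, a minimal callback sketch (illustrative, not part of this module):
+        >>> class LossPrinter(AbstractCallback):
+        ...     def __call__(self, epoch, step, performance_measures, context):
+        ...         print(epoch, step, performance_measures.get('loss'))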
+ """ + pass diff --git a/sonosco/training/experiment.py b/sonosco/training/experiment.py index 23f46e0..07fd1cf 100644 --- a/sonosco/training/experiment.py +++ b/sonosco/training/experiment.py @@ -1,11 +1,14 @@ import os import os.path as path -import sys import datetime +import logging import sonosco.common.path_utils as path_utils +import sonosco.common.utils as utils from time import time -from sonosco.common.utils import copy_code + + +LOGGER = logging.getLogger(__name__) class Experiment: @@ -33,23 +36,23 @@ def __init__(self, self.name = self._set_experiment_name(experiment_name) self.path = path.join(self.experiments_path, self.name) # path to current experiment self.logs = path.join(self.experiments_path, "logs") + self.code = path.join(self.experiments_path, "code") self._sub_directories = sub_directories self._exclude_dirs = exclude_dirs - self._exclude_dirs.extend(exclude_dirs) self._exclude_files = exclude_files - self._exclude_files.extend(exclude_files) self._init_directories() self._copy_sourcecode() + self._set_logging() @staticmethod def _set_experiments_dir(experiments_path): if experiments_path is not None: return experiments_path - local_path = os.path.dirname(sys.argv[0]) + local_path = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) local_path = local_path if local_path != '' else './' return path.join(local_path, "experiments") @@ -58,6 +61,9 @@ def _set_experiment_name(experiment_name): date_time = datetime.datetime.fromtimestamp(time()).strftime('%Y-%m-%d_%H:%M:%S') return f"{date_time}_{experiment_name}" + def _set_logging(self): + utils.add_log_file(self.logs, LOGGER) + def _init_directories(self): """ Create all basic directories. """ path_utils.try_create_directory(self.experiments_path) @@ -71,11 +77,11 @@ def _add_member(self, key, value): def _copy_sourcecode(self): """ Copy code from execution directory in experiment code directory. """ - sources_path = os.path.dirname(sys.argv[0]) + sources_path = os.path.dirname(os.path.dirname(__file__)) sources_path = sources_path if sources_path != '' else './' - copy_code(sources_path, self.code, - exclude_dirs=self._exclude_dirs, - exclude_files=self._exclude_files) + utils.copy_code(sources_path, self.code, + exclude_dirs=self._exclude_dirs, + exclude_files=self._exclude_files) def add_directory(self, dir_name): """ @@ -96,3 +102,7 @@ def add_file(folder_path, filename, content): """ Adds a file with provided content to folder. Convenience function. """ with open(path.join(folder_path, filename), 'w') as text_file: text_file.write(content) + + @staticmethod + def create(name: str): + return Experiment(name) diff --git a/sonosco/training/helpers.py b/sonosco/training/helpers.py new file mode 100644 index 0000000..893c14d --- /dev/null +++ b/sonosco/training/helpers.py @@ -0,0 +1,143 @@ +import logging +import sys +import os.path as path +import numpy as np +import torch + +from collections import defaultdict +from .abstract_callback import AbstractCallback + + +LOGGER = logging.getLogger(__name__) + + +class HistoryRecorder(AbstractCallback): + """ Records all losses and metrics during training. 
""" + + def __init__(self, epoch_steps): + self.history = defaultdict(list) + self._epoch_steps = epoch_steps + + def __call__(self, epoch, step, performance_measures, context): + + if step % self._epoch_steps == 0: # only record at end of epoch + return + + for key, value in performance_measures.items(): + if type(value) == torch.Tensor: + value = value.item() + self.history[key].append(value) + + +class ModelCheckpoint(AbstractCallback): + """ + Saves the model and optimizer state at the point with lowest validation error throughout training. + Args: + output_path (string): path to directory where the checkpoint will be saved to + model_name (string): name of the checkpoint file + """ + + def __init__(self, output_path, model_name='model_checkpoint.pt'): + self.output_path = path.join(output_path, model_name) + self.best_val_score = sys.float_info.max + + def __call__(self, epoch, step, performance_measures, context): + + if 'val_loss' not in performance_measures: + return + + if performance_measures['val_loss'] < self.best_val_score: + self.best_val_score = performance_measures['val_loss'] + self._save_checkpoint(context.model, context.optimizer, epoch) + + def _save_checkpoint(self, model, optimizer, epoch): + LOGGER.info("Saving model at checkpoint.") + model.eval() + model_state_dict = model.state_dict() + optimizer_state_dict = optimizer.state_dict() + torch.save({'arch': model.__class__.__name__, + 'epoch': epoch, + 'model_state_dict': model_state_dict, + 'optimizer_state_dict': optimizer_state_dict + }, self.output_path) + model.train() + + +class LayerwiseGradientNorm(AbstractCallback): + """ Collects the layer-wise gradient norms for each epoch. """ + + def __init__(self): + self.layer_grads = dict() + self._batch_layer_grads = dict() + + def __call__(self, epoch, step, performance_measures, context): + """ + Store gradient norms for each batch and compute means after the + epoch's last batch. + """ + self._store_batch_layer_grads(context.model) + + if step == (len(context.train_data_loader) - 1): # end of epoch + self._store_layer_grads() + self._batch_layer_grads = dict() + + def _store_batch_layer_grads(self, model): + """ Store gradient norm of each layer for current batch. """ + for name, param in model.named_parameters(): + + if not param.requires_grad or param.grad is None: + continue + + if not name in self._batch_layer_grads: + self._batch_layer_grads[name] = [] + + grad_norm = torch.sqrt(torch.sum(param.grad**2)).item() + self._batch_layer_grads[name].append(grad_norm) + + def _store_layer_grads(self): + """ Compute mean of all batch steps in epoch. """ + for name, grads in self._batch_layer_grads.items(): + + if name not in self.layer_grads: + self.layer_grads[name] = [] + + layer_epoch_grad = np.mean(grads) + self.layer_grads[name].append(layer_epoch_grad) + + +class EarlyStopping(AbstractCallback): + """ + Early Stopping to terminate training early if the monitored metric did not improve + over a number of epochs. 
+ Args: + monitor (string): name of the relevant loss or metric (usually 'val_loss') + min_delta (float): minimum change in monitored metric to qualify as an improvement + patience (int): number of epochs to wait for an improvement before terminating the training + """ + + def __init__(self, monitor='val_loss', min_delta=0, patience=5): + self.monitor = monitor + self.min_delta = min_delta + self.patience = patience + self.last_best = sys.float_info.max + self.counter = 0 + self.stopped_epoch = 0 + + def __call__(self, epoch, step, performance_measures, context): + + if step != len(context.train_data_loader) - 1: # only continue at end of epoch + return + + if self.monitor not in performance_measures: + return + + current_loss = performance_measures[self.monitor] + if (self.last_best - current_loss) >= self.min_delta: + self.last_best = current_loss + self.counter = 0 + else: + self.counter += 1 + + if self.counter >= self.patience: + context._stop_training = True # make ModelTrainer stop + LOGGER.info(f"Early stopping after epoch {epoch}") diff --git a/sonosco/training/learning_rates.py b/sonosco/training/learning_rates.py new file mode 100644 index 0000000..e977514 --- /dev/null +++ b/sonosco/training/learning_rates.py @@ -0,0 +1,131 @@ +import logging +import sys + +from .abstract_callback import AbstractCallback + + +LOGGER = logging.getLogger(__name__) + + +class StepwiseLearningRateReduction(AbstractCallback): + """ + Reduces the learning rate of the optimizer every N epochs. + Args: + epoch_steps (int): number of epochs after which learning rate is reduced + reduction_factor (float): multiplicative factor for learning rate reduction + min_lr (float): lower bound for learning rate + """ + + def __init__(self, epoch_steps, reduction_factor, min_lr=None): + self._epoch_steps = epoch_steps + self._reduction_factor = reduction_factor + self._min_lr = min_lr + + def __call__(self, epoch, step, performance_measures, context): + # execute at the beginning of every Nth epoch + if epoch > 0 and step == 0 and epoch % self._epoch_steps == 0: + + # reduce lr for each param group (necessary for e.g. Adam) + for param_group in context.optimizer.param_groups: + new_lr = param_group['lr'] * self._reduction_factor + + if self._min_lr is not None and new_lr < self._min_lr: + continue + + param_group['lr'] = new_lr + LOGGER.info("Epoch {}: Reducing learning rate to {}".format(epoch, new_lr)) + + +class ScheduledLearningRateReduction(AbstractCallback): + """ + Reduces the learning rate of the optimizer for every scheduled epoch. + Args: + epoch_schedule (list of int): defines at which epoch the learning rate will be reduced + reduction_factor (float): multiplicative factor for learning rate reduction + min_lr (float): lower bound for learning rate + """ + + def __init__(self, epoch_schedule, reduction_factor, min_lr=None): + self._epoch_schedule = sorted(epoch_schedule) + self._reduction_factor = reduction_factor + self._min_lr = min_lr + + def __call__(self, epoch, step, performance_measures, context): + + if not self._epoch_schedule: # stop if schedule is empty + return + + next_epoch_step = self._epoch_schedule[0] + if epoch >= next_epoch_step and step == 0: + + # reduce lr for each param group (necessary for e.g. 
Adam) + for param_group in context.optimizer.param_groups: + new_lr = param_group['lr'] * self._reduction_factor + + if self._min_lr is not None and new_lr < self._min_lr: + continue + + param_group['lr'] = new_lr + LOGGER.info("Epoch {}: Reducing learning rate to {}".format(epoch, new_lr)) + + self._epoch_schedule.pop(0) + + +class ReduceLROnPlateau(AbstractCallback): + """ + Reduce the learning rate if the train or validation loss plateaus. + Args: + monitor (string): name of the relevant loss or metric (usually 'val_loss') + factor (float): factor by which the lr is decreased at each step + patience (int): number of epochs to wait on plateau for loss improvement before reducing lr + min_delta (float): minimum improvement necessary to reset patience + cooldown (int): number of epochs to cooldown after a lr reduction + min_lr (float): minimum value the learning rate can decrease to + verbose (bool): print to console + """ + + def __init__(self, monitor='val_loss', factor=0.1, patience=10, min_delta=0, cooldown=0, min_lr=0, verbose=False): + self.monitor = monitor + if factor >= 1.0 or factor < 0: + raise ValueError('ReduceLROnPlateau does only support a factor in [0,1[.') + self.factor = factor + self.min_lr = min_lr + self.min_delta = min_delta + self.patience = patience + self.verbose = verbose + self.cooldown = cooldown + self.cooldown_counter = 0 + self.wait = 0 + self.best_loss = sys.float_info.max + + def __call__(self, epoch, step, performance_measures, context): + + if self.monitor not in performance_measures: + return + + if step != len(context.train_data_loader)-1: # only continue at end of epoch + return + + if self.cooldown_counter > 0: # in cooldown phase + self.cooldown_counter -= 1 + self.wait = 0 + + current_loss = performance_measures[self.monitor] + if (self.best_loss - current_loss) >= self.min_delta: # loss improved, save and reset wait counter + self.best_loss = current_loss + self.wait = 0 + + elif self.cooldown_counter <= 0: # no improvement and not in cooldown + + if self.wait >= self.patience: # waited long enough, reduce lr + for param_group in context.optimizer.param_groups: + old_lr = param_group['lr'] + new_lr = old_lr * self.factor + if new_lr >= self.min_lr: # only decrease if there is still enough buffer space + if self.verbose: + LOGGER.info("Epoch {}: Reducing learning rate from {} to {}".format(epoch, old_lr, new_lr)) #TODO print per param group? + param_group['lr'] = new_lr + self.cooldown_counter = self.cooldown # new cooldown phase after lr reduction + self.wait = 0 + else: + self.wait += 1 diff --git a/sonosco/training/trainer.py b/sonosco/training/trainer.py new file mode 100644 index 0000000..ba438fa --- /dev/null +++ b/sonosco/training/trainer.py @@ -0,0 +1,252 @@ +import logging +import torch +import torch.optim.optimizer +import torch.nn.utils.clip_grad as grads + +from collections import defaultdict +from typing import Callable, Union, Tuple, List, Any +from torch.utils.data import DataLoader +from .abstract_callback import AbstractCallback + + +LOGGER = logging.getLogger(__name__) + + +class ModelTrainer: + """ + This class handles the training of a pytorch model. It provides convenience + functionality to add metrics and callbacks and is inspired by the keras API. + Args: + model (nn.Module): model to be trained + optimizer (optim.Optimizer): optimizer used for training, e.g. 
torch.optim.Adam
+        loss (function): loss function that either accepts (model_output, label) or (input, label, model) if custom_model_eval is true
+        epochs (int): epochs to train
+        train_data_loader (utils.data.DataLoader): training data
+        val_data_loader (utils.data.DataLoader, optional): validation data
+        custom_model_eval (boolean, optional): enables training mode where the model is evaluated in the loss function
+        gpu (int, optional): if not set, training runs on cpu, otherwise an int is expected that determines the training gpu
+        clip_grads (float, optional): if set, training gradients will be clipped at the specified norm
+    Example:
+        >>> model_trainer = ModelTrainer(model, F.nll_loss, num_epochs, train_loader, gpu=0)
+        >>> model_trainer.start_training()
+    """
+
+    def __init__(self,
+                 model: torch.nn.Module,
+                 loss: Union[Callable[[torch.Tensor, torch.Tensor], float],
+                             Callable[[torch.Tensor, torch.Tensor, torch.nn.Module], float]],
+                 epochs: int,
+                 train_data_loader: DataLoader,
+                 val_data_loader: DataLoader = None,
+                 optimizer=torch.optim.Adam,
+                 lr: float = 1e-4,
+                 custom_model_eval: bool = False,
+                 gpu: int = None,
+                 clip_grads: float = None,
+                 metrics: List[Callable[[torch.Tensor, Any], Union[float, torch.Tensor]]] = None,
+                 callbacks: List[AbstractCallback] = None):
+
+        self.model = model
+        self.train_data_loader = train_data_loader
+        self.val_data_loader = val_data_loader
+        self.optimizer = optimizer(self.model.parameters(), lr=lr)
+        self.loss = loss
+        self._epochs = epochs
+        self._metrics = metrics if metrics is not None else list()
+        self._callbacks = callbacks if callbacks is not None else list()
+        self._gpu = gpu
+        self._custom_model_eval = custom_model_eval
+        self._clip_grads = clip_grads
+        self._stop_training = False # used to stop training externally
+
+    def set_metrics(self, metrics):
+        """
+        Set metric functions that receive y_pred and y_true. Metrics are expected to return
+        a basic numeric type like float or int.
+        """
+        self._metrics = metrics
+
+    def add_metric(self, metric):
+        self._metrics.append(metric)
+
+    def set_callbacks(self, callbacks):
+        """
+        Set callbacks that are callable functionals and receive epoch, step, loss, context.
+        Context is a pointer to the ModelTrainer instance. Callbacks are called after each
+        processed batch.
+        """
+        self._callbacks = callbacks
+
+    def add_callback(self, callback):
+        self._callbacks.append(callback)
+
+    def start_training(self):
+        self.model.train() # train mode
+        for epoch in range(1, self._epochs + 1):
+            self._epoch_step(epoch)
+
+            if self._stop_training:
+                break
+
+        self._close_callbacks()
+
+    def _epoch_step(self, epoch):
+        """ Execute one training epoch.
""" + running_batch_loss = 0 + running_metrics = defaultdict(float) + + for step, batch in enumerate(self.train_data_loader): + batch = self._recursive_to_cuda(batch) # move to GPU + + # compute training batch + loss, model_output, grad_norm = self._train_on_batch(batch) + running_batch_loss += loss.item() + + # compute metrics + self._compute_running_metrics(model_output, batch, running_metrics) + running_metrics['gradient_norm'] += grad_norm # add grad norm to metrics + + # evaluate validation set at end of epoch + if self.val_data_loader and step == (len(self.train_data_loader) - 1): + self._compute_validation_error(running_metrics) + + # print current loss and metrics and provide it to callbacks + performance_measures = self._construct_performance_dict(step, running_batch_loss, running_metrics) + self._print_step_info(epoch, step, performance_measures) + self._apply_callbacks(epoch, step, performance_measures) + + def stop_training(self): + self._stop_training = True + + def _comp_gradients(self): + """ Compute the gradient norm for all model parameters. """ + grad_sum = 0 + for param in self.model.parameters(): + if param.requires_grad and param.grad is not None: + grad_sum += torch.sum(param.grad ** 2) + grad_norm = torch.sqrt(grad_sum).item() + return grad_norm + + def _train_on_batch(self, batch): + """ Compute loss depending on settings, compute gradients and apply optimization step. """ + # evaluate loss + batch_x, batch_y = batch + if self._custom_model_eval: + loss, model_output = self.loss(batch, self.model) + else: + model_output = self.model(batch_x) + loss = self.loss(model_output, batch_y) + + self.optimizer.zero_grad() # reset gradients + loss.backward() # backpropagation + + # gradient clipping + if self._clip_grads is not None: + grads.clip_grad_norm(self.model.parameters(), self._clip_grads) + + grad_norm = self._comp_gradients() # compute average gradient norm + + self.optimizer.step() # apply optimization step + return loss, model_output, grad_norm + + def _compute_validation_error(self, running_metrics): + """ Evaluate the model's validation error. """ + running_val_loss = 0 + + self.model.eval() + for batch in self.val_data_loader: + batch = self._recursive_to_cuda(batch) + + # evaluate loss + batch_x, batch_y = batch + if self._custom_model_eval: # e.g. used for sequences and other complex model evaluations + val_loss, model_output = self.loss(batch, self.model) + else: + model_output = self.model(batch_x) + val_loss = self.loss(model_output, batch_y) + + # compute running validation loss and metrics. add 'val_' prefix to all measures. + running_val_loss += val_loss.item() + self._compute_running_metrics(model_output, batch, running_metrics, prefix='val_') + self.model.train() + + # add loss to metrics and normalize all validation measures + running_metrics['val_loss'] = running_val_loss + for key, value in running_metrics.items(): + if 'val_' not in key: + continue + running_metrics[key] = value / len(self.val_data_loader) + + def _compute_running_metrics(self, + y_pred: torch.Tensor, + batch: Tuple[torch.Tensor, torch.Tensor], + running_metrics: dict, + prefix: str = ''): + """ + Computes all metrics based on predictions and batches and adds them to the metrics + dictionary. Allows to prepend a prefix to the metric names in the dictionary. 
+ """ + for metric in self._metrics: + if self._custom_model_eval: + metric_result = metric(y_pred, batch) + else: + batch_y = batch[1] + metric_result = metric(y_pred, batch_y) + + # convert to float if metric returned tensor + if type(metric_result) == torch.Tensor: + metric_result = metric_result.item() + + running_metrics[prefix + metric.__name__] += metric_result + + def _construct_performance_dict(self, train_step, running_batch_loss, running_metrics): + """ + Constructs a combined dictionary of losses and metrics for callbacks based on + the current running averages. + """ + performance_dict = defaultdict() + for key, value in running_metrics.items(): + if 'val_' not in key: + performance_dict[key] = value / (train_step + 1.) + else: + performance_dict[key] = value # validation metrics, already normalized + + performance_dict['loss'] = running_batch_loss / (train_step + 1.) + return performance_dict + + def _apply_callbacks(self, epoch, step, performance_measures): + """ Call all registered callbacks with current batch information. """ + for callback in self._callbacks: + callback(epoch, step, performance_measures, self) + + def _close_callbacks(self): + """ Signal callbacks training is finished. """ + for callback in self._callbacks: + callback.close() + + def _print_step_info(self, epoch, step, performance_measures): + """ Print running averages for loss and metrics during training. """ + output_message = "epoch {} batch {}/{}".format(epoch, step, len(self.train_data_loader) - 1) + delim = " " + for metric_name in sorted(list(performance_measures.keys())): + if metric_name == 'gradient_norm': + continue + output_message += delim + "{}: {:.6f}".format(metric_name, performance_measures[metric_name]) + LOGGER.info(output_message) + + def _recursive_to_cuda(self, tensors): + """ + Recursively iterates nested lists in depth-first order and transfers all tensors + to specified cuda device. 
+ Parameters: + tensors (list or Tensor): list of tensors or tensor tuples, can be nested + """ + if self._gpu is None: # keep on cpu + return tensors + + if type(tensors) != list: # not only for torch.Tensor + return tensors.to(device=self._gpu) + + for i in range(len(tensors)): + tensors[i] = self._recursive_to_cuda(tensors[i]) + return tensors diff --git a/tests/test_dataset.py b/tests/test_dataset.py index a80c5c9..f895354 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -6,9 +6,9 @@ from sonosco.common.constants import SONOSCO from sonosco.common.utils import setup_logging -from sonosco.datasets.audio_dataset import AudioDataset, DataProcessor -from sonosco.datasets.audio_data_sampler import BucketingSampler -from sonosco.datasets.audio_data_loader import DataLoader +from sonosco.datasets.audio_dataset import AudioDataset, AudioDataProcessor +from sonosco.datasets.data_sampler import BucketingSampler +from sonosco.datasets.data_loader import DataLoader from sonosco.datasets.download_datasets.librispeech import try_download_librispeech @@ -44,7 +44,7 @@ def test_librispeech_clean(logger): # create data processor audio_conf = dict(sample_rate=SAMPLE_RATE, window_size=.02, window_stride=.01, labels='ABCDEFGHIJKLMNOPQRSTUVWXYZ', normalize=True, augment=True) - processor = DataProcessor(**audio_conf) + processor = AudioDataProcessor(**audio_conf) # get manifest file manifest_directory = os.path.join(os.path.expanduser("~"), LIBRI_SPEECH_DIR) From 7d6a3d598916962a5f5680ac00ea57af8178cecc Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Wed, 19 Jun 2019 17:49:11 +0200 Subject: [PATCH 46/58] add travis ci --- .travis.yaml | 11 +++++++++++ .../install_dependencies.sh | 0 2 files changed, 11 insertions(+) create mode 100644 .travis.yaml rename install_dependencies.sh => scripts/install_dependencies.sh (100%) diff --git a/.travis.yaml b/.travis.yaml new file mode 100644 index 0000000..c1c8b03 --- /dev/null +++ b/.travis.yaml @@ -0,0 +1,11 @@ +language: python + +python: + - "3.6" + +install: + - bash scripts/install_dependencies.sh + - pip install -e . 
+
+script:
+  - pytest
diff --git a/install_dependencies.sh b/scripts/install_dependencies.sh
similarity index 100%
rename from install_dependencies.sh
rename to scripts/install_dependencies.sh

From f18f303085b1d3cff2e3c75ce6902666a8106417 Mon Sep 17 00:00:00 2001
From: yuriyarabskyy
Date: Wed, 19 Jun 2019 19:02:43 +0200
Subject: [PATCH 47/58] update run_trainer

---
 sonosco/config/train.yaml             |  4 +--
 sonosco/config/train_librispeech.yaml | 49 +++++++++++++++++++++++++++
 sonosco/datasets/audio_dataset.py     | 12 ++-----
 sonosco/datasets/data_loader.py       | 14 ++++----
 sonosco/run_training.py               |  5 +--
 sonosco/training/trainer.py           |  1 +
 6 files changed, 65 insertions(+), 20 deletions(-)
 create mode 100644 sonosco/config/train_librispeech.yaml

diff --git a/sonosco/config/train.yaml b/sonosco/config/train.yaml
index 3e9cba9..3e8e364 100644
--- a/sonosco/config/train.yaml
+++ b/sonosco/config/train.yaml
@@ -3,7 +3,7 @@ train:
   val_manifest: 'examples/manifests/val_manifest.csv'
   labels_path: 'examples/labels.json' # Contains all characters for transcription
   log_dir: 'logs' # Location for log files
-  def_dir: 'examples/checkpoints/', # Default location to save/load models
+  def_dir: 'examples/checkpoints/' # Default location to save/load models

   load_from: 'asr_final.pth' # File name containing a checkpoint to continue/finetune

@@ -19,7 +19,7 @@ train:
   labels: 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' # labels used by the model

   max_epochs: 70 # Number of training epochs
-  learning_rate: 3e-4 # Initial learning rate
+  learning_rate: 3.0e-4 # Initial learning rate
   momentum: 0.9 # Momentum
   max_norm: 800 # Norm cutoff to prevent explosion of gradients
   learning_anneal: 1.1 # Annealing applied to learning rate every epoch

diff --git a/sonosco/config/train_librispeech.yaml b/sonosco/config/train_librispeech.yaml
new file mode 100644
index 0000000..eba936b
--- /dev/null
+++ b/sonosco/config/train_librispeech.yaml
@@ -0,0 +1,49 @@
+train:
+  train_manifest: '/Users/yuriy/temp/data/libri_speech/libri_test_clean_manifest.csv'
+  val_manifest: '/Users/yuriy/temp/data/libri_speech/libri_test_clean_manifest.csv'
+  log_dir: 'logs' # Location for log files
+  def_dir: 'examples/checkpoints/' # Default location to save/load models
+
+  load_from: 'asr_final.pth' # File name containing a checkpoint to continue/finetune
+
+  sample_rate: 16000 # Sample rate
+  window_size: 0.02 # Window size for spectrogram in seconds
+  window_stride: 0.01 # Window stride for spectrogram in seconds
+  window: 'hamming' # Window type for spectrogram generation
+
+  batch_size: 32 # Batch size for training
+  hidden_size: 800 # Hidden size of RNNs
+  hidden_layers: 5 # Number of RNN layers
+  rnn_type: 'gru' # Type of the RNN unit: gru|lstm are supported
+  labels: 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' # labels used by the model
+
+  max_epochs: 70 # Number of training epochs
+  learning_rate: 3.0e-4 # Initial learning rate
+  momentum: 0.9 # Momentum
+  max_norm: 800 # Norm cutoff to prevent explosion of gradients
+  learning_anneal: 1.1 # Annealing applied to learning rate every epoch
+  sortaGrad: True # Turn on ordering of dataset on sequence length for the first epoch
+
+  checkpoint: True # Enables checkpoint saving of model
+  checkpoint_per_epoch: 1 # Save checkpoint per x epochs
+  silent: False # Turn on progress tracking per iteration
+  verbose: False # Turn on verbose progress tracking
+  continue: False # Continue training with a pre-trained model
+  finetune: False # Finetune a pre-trained model
+
+  num_data_workers: 8 # Number of workers used in data-loading
+  augment: False # Use random tempo and gain perturbations
+  shuffle: True # Turn on shuffling and sample from dataset based on sequence length (smallest to largest)
+
+  seed: 123456 # Seed to generators
+  cuda: True # Use cuda to train model
+  half_precision: Trues # Uses half precision to train a model
+  apex: True # Uses mixed precision to train a model
+  static_loss_scaling: False # Static loss scale for mixed precision
+  dynamic_loss_scaling: True # Use dynamic loss scaling for mixed precision
+
+  dist_url: 'tcp://127.0.0.1:1550' # URL used to set up distributed training
+  dist_backend: 'nccl' # Distributed backend
+  world_size: 1 # Number of distributed processes
+  rank: 0 # The rank of the current process
+  gpu_rank: 0 # If using distributed parallel for multi_gpu, sets the GPU for the process
\ No newline at end of file
diff --git a/sonosco/datasets/audio_dataset.py b/sonosco/datasets/audio_dataset.py
index c82e697..74683f0 100644
--- a/sonosco/datasets/audio_dataset.py
+++ b/sonosco/datasets/audio_dataset.py
@@ -24,7 +24,7 @@
 class AudioDataProcessor:

-    def __init__(self, window_stride, window_size, sample_rate, labels="abc", normalize=False, augment=False):
+    def __init__(self, window_stride, window_size, sample_rate, labels="abc", normalize=False, augment=False, **kwargs):
         """
         Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by
         a comma. Each new line is a different sample. Example below:
         /path/to/audio.wav,/path/to/audio.txt
         ...
@@ -76,13 +76,8 @@ def parse_audio(self, audio_path, raw=False):
         if raw:
             return sound

-        sound_tensor = torch.from_numpy(sound)
-
-        if global_settings.CUDA_ENABLED:
-            sound_tensor = sound_tensor.cuda()
-
         # TODO: comment why take the last element?
-        complex_spectrogram = librosa.stft(sound_tensor,
+        complex_spectrogram = librosa.stft(sound,
                                            n_fft=self.window_size_samples,
                                            hop_length=self.window_stride_samples,
                                            win_length=self.window_size_samples)
         spectrogram, phase = librosa.magphase(complex_spectrogram)
         # S = log(S+1)
         spectrogram = torch.from_numpy(np.log1p(spectrogram))

@@ -95,10 +90,9 @@ def parse_transcript(self, transcript_path):
     def parse_transcript(self, transcript_path):
         with open(transcript_path, 'r', encoding='utf8') as transcript_file:
             transcript = transcript_file.read().replace('\n', '')
-            LOGGER.info(f"1: {transcript}")
         # TODO: Is it fast enough?
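+        # maps each character to its index in labels_map; filter(None, ...)
+        # drops unmapped characters (note: it would also drop a label at index 0)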
transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)])) - LOGGER.info(f"transcript_path: {transcript_path} transcript: {transcript}") + LOGGER.debug(f"transcript_path: {transcript_path} transcript: {transcript}") return transcript diff --git a/sonosco/datasets/data_loader.py b/sonosco/datasets/data_loader.py index b9f1115..74912d6 100644 --- a/sonosco/datasets/data_loader.py +++ b/sonosco/datasets/data_loader.py @@ -39,21 +39,21 @@ def _collate_fn(self, batch): return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) -def create_data_loaders(train_manifest, val_manifest, batch_size, num_data_workers, **kwargs): +def create_data_loaders(**kwargs): processor = AudioDataProcessor(**kwargs) # create train loader - train_dataset = AudioDataset(processor, manifest_filepath=train_manifest) + train_dataset = AudioDataset(processor, manifest_filepath=kwargs["train_manifest"]) LOGGER.info(f"Training dataset containing {len(train_dataset)} samples is created") - sampler = BucketingSampler(train_dataset, batch_size=batch_size) - train_loader = AudioDataLoader(dataset=train_dataset, num_workers=num_data_workers, batch_sampler=sampler) + sampler = BucketingSampler(train_dataset, batch_size=kwargs["batch_size"]) + train_loader = AudioDataLoader(dataset=train_dataset, num_workers=kwargs["num_data_workers"], batch_sampler=sampler) LOGGER.info("Training data loader created.") # create validation loader - val_dataset = AudioDataset(processor, manifest_filepath=val_manifest) + val_dataset = AudioDataset(processor, manifest_filepath=kwargs["val_manifest"]) LOGGER.info(f"Validation dataset containing {len(val_dataset)} samples is created") - sampler = BucketingSampler(val_dataset, batch_size=batch_size) - val_loader = AudioDataLoader(dataset=val_dataset, num_workers=num_data_workers, batch_sampler=sampler) + sampler = BucketingSampler(val_dataset, batch_size=kwargs["batch_size"]) + val_loader = AudioDataLoader(dataset=val_dataset, num_workers=kwargs["num_data_workers"], batch_sampler=sampler) LOGGER.info("Validation data loader created.") return train_loader, val_loader diff --git a/sonosco/run_training.py b/sonosco/run_training.py index 62222f7..b5485e3 100644 --- a/sonosco/run_training.py +++ b/sonosco/run_training.py @@ -14,10 +14,11 @@ @click.command() @click.option("-e", "--experiment_name", default="default", type=click.STRING, help="Experiment name.") -@click.option("-c", "--config_path", default="config/train.yaml", type=click.Path, help="Path to train configurations.") +@click.option("-c", "--config_path", default="config/train.yaml", type=click.STRING, + help="Path to train configurations.") def main(experiment_name, config_path): Experiment.create(experiment_name) - config = parse_yaml(config_path) + config = parse_yaml(config_path)["train"] train_loader, val_loader = create_data_loaders(**config) diff --git a/sonosco/training/trainer.py b/sonosco/training/trainer.py index ba438fa..15841b7 100644 --- a/sonosco/training/trainer.py +++ b/sonosco/training/trainer.py @@ -96,6 +96,7 @@ def _epoch_step(self, epoch): running_metrics = defaultdict(float) for step, batch in enumerate(self.train_data_loader): + import pdb; pdb.set_trace() batch = self._recursive_to_cuda(batch) # move to GPU # compute training batch From 9fc9eef20f6c051fb430031f7c1b37f946c3ab7e Mon Sep 17 00:00:00 2001 From: "w.jurasz" Date: Fri, 21 Jun 2019 15:03:00 +0200 Subject: [PATCH 48/58] Added serialize decorator --- .gitignore | 2 -- sonosco/common/class_utils.py | 12 
+++---- sonosco/model.py | 14 ++++++-- sonosco/serialization.py | 67 +++++++++++++++++++++++++++++++++++ 4 files changed, 85 insertions(+), 10 deletions(-) create mode 100644 sonosco/serialization.py diff --git a/.gitignore b/.gitignore index 453b1ff..385f777 100644 --- a/.gitignore +++ b/.gitignore @@ -2,8 +2,6 @@ sonosco/pycandle/ sonosco/pycandle sonosco/experiments/ -sonosco/datasets/download_datasets/ -!sonosco/datasets/download_datasets/*.py **/.DS_Store diff --git a/sonosco/common/class_utils.py b/sonosco/common/class_utils.py index b73cafa..e4af560 100644 --- a/sonosco/common/class_utils.py +++ b/sonosco/common/class_utils.py @@ -1,23 +1,23 @@ import inspect -from typing import List +from typing import Set -def get_constructor_args(cls: type) -> List[str]: +def get_constructor_args(cls) -> Set[str]: """ E.g. class Bar(): def __init__(self, arg1, arg2): - get_constructor_args(BAR) + get_constructor_args(Bar) # returns ['arg1', 'arg2'] Args: - cls (type): + cls (object): - Returns: list containing names of constructor arguments + Returns: set containing names of constructor arguments """ - return inspect.getfullargspec(cls.__init__).args[1:] + return set(inspect.getfullargspec(cls.__init__).args[1:]) def get_class_by_name(name: str) -> type: diff --git a/sonosco/model.py b/sonosco/model.py index 5bbbdea..6d12aec 100644 --- a/sonosco/model.py +++ b/sonosco/model.py @@ -80,11 +80,20 @@ def __init__(self, arg1, arg2): Args: - model (nn.Module): model to infre from + model (nn.Module): model to infer from Returns (dict): Mapping from __init__ argument to it's value """ + constructor_args = get_constructor_args(model) + model_attributes = model.__dict__ + attributes_names = set(model_attributes.keys()) + + ambiguous_arguments = constructor_args - attributes_names + + if ambiguous_arguments: + LOGGER.warning(f"Some constructor arguments do not have equivalent fields ") + return {} @@ -145,11 +154,12 @@ def load_model(self, cls: type, path: str, deserialize_method_name: str = 'deser package = torch.load(path, map_location=lambda storage, loc: storage) if hasattr(cls, deserialize_method_name) and callable(getattr(cls, deserialize_method_name)): return getattr(cls, deserialize_method_name)(package) - constructor_args = set(get_constructor_args(cls)) + constructor_args = get_constructor_args(cls) stored_keys = set(package.keys()) stored_keys.remove('state_dict') args_to_apply = constructor_args & stored_keys + # If the lengths are not equal it means that there is some inconsistency between save and load if len(args_to_apply) != len(constructor_args): not_in_constructor = stored_keys - constructor_args if not_in_constructor: diff --git a/sonosco/serialization.py b/sonosco/serialization.py new file mode 100644 index 0000000..24b136b --- /dev/null +++ b/sonosco/serialization.py @@ -0,0 +1,67 @@ +from dataclasses import _process_class, _create_fn, _set_new_attribute, fields, is_dataclass +__primitives = {int, float, str, bool} +__iterables = [list, set, tuple] + +def serializable(_cls=None): + """ + + Returns the same class as was passed in, with init and serialize methods. + + + Args: + _cls: + + Returns: + + """ + + def wrap(cls): + cls = _process_class(cls, init=True, repr=False, eq=False, order=False, unsafe_hash=False, frozen=False) + _set_new_attribute(cls, '__serialize__', __add_serialize(cls)) + return cls + + # See if we're being called as @dataclass or @dataclass(). + if _cls is None: + # We're called with parens. + return wrap + + # We're called as @dataclass without parens. 
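+    # Intended usage (illustrative sketch, names are examples only):
+    #
+    #     @serializable
+    #     class Checkpoint:
+    #         epoch: int = 0
+    #
+    # Checkpoint then gets a dataclass-style __init__ plus a __serialize__
+    # method that returns its fields as a dict.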
+    return wrap(_cls)
+
+
+def __add_serialize(cls):
+    fields_to_serialize = fields(cls)
+    sonosco_self = ['__sonosco_self__' if 'self' in fields_to_serialize else 'self']
+    serialize_body = __create_serialize_body(fields_to_serialize)
+    return _create_fn('__serialize__', [sonosco_self], [f'return {serialize_body}'], return_type=dict)
+
+
+def __create_serialize_body(fields_to_serialize):
+    body_lines = ["{"]
+    for field in fields_to_serialize:
+        if __is_primitive(field) or __is_iterable_of_primitives(field):
+            body_lines.append(__create_dict_entry(field.name, f"self.{field.name}"))
+        elif is_dataclass(field.type):
+            body_lines.append(__create_dict_entry(field.name, f"self.{field.name}.__serlialize__()"))
+        else:
+            __throw_unsupported_data_type()
+    body_lines.append("}")
+    return body_lines
+
+
+def __is_iterable_of_primitives(field):
+    return field.__origin__ in __iterables and field.__args__[0] in __primitives
+
+
+def __throw_unsupported_data_type():
+    raise TypeError("Unsupported data type. Only primitives, lists of primitives, "
+                    "@serializable and @dataclass objects can be seralized")
+
+
+def __create_dict_entry(key, value):
+    return f'\'{key}\': {value},'
+
+
+def __is_primitive(obj):
+    return obj.type in __primitives
+

From f488e7369058bc0a6c13c6652d812cdb8e03c7d6 Mon Sep 17 00:00:00 2001
From: "w.jurasz"
Date: Sun, 23 Jun 2019 00:20:44 +0200
Subject: [PATCH 49/58] Improved serialize to support torch.nn.Module

---
 sonosco/model.py                      |  52 ++----------
 sonosco/models/deepspeech2_sonosco.py | 118 ++++++++++++++++++++++++++
 sonosco/serialization.py              |  23 ++++-
 3 files changed, 144 insertions(+), 49 deletions(-)
 create mode 100644 sonosco/models/deepspeech2_sonosco.py

diff --git a/sonosco/model.py b/sonosco/model.py
index 6d12aec..c943244 100644
--- a/sonosco/model.py
+++ b/sonosco/model.py
@@ -1,10 +1,12 @@
 import logging
+
 import torch
 import deprecation
 import inspect
 import torch.nn as nn

 from common.class_utils import get_constructor_args, get_class_by_name
+from serialization import is_serializable

 LOGGER = logging.getLogger(__name__)

@@ -28,8 +30,7 @@ def save_model_simple(self, model: nn.Module, path: str) -> None:
         """
         torch.save(model, path)

-    def save_model(self, model: nn.Module, path: str, infer_structure: bool = False,
-                   serialize_method_name: str = 'serialize') -> None:
+    def save_model(self, model: nn.Module, path: str) -> None:
         """
         Saves the model using pickle protocol.
@@ -51,50 +52,11 @@ def save_model(self, model: nn.Module, path: str, infer_structure: bool = False,

         Returns:

         """
-        if infer_structure:
-            entity_to_save = self.get_constructor_args_with_values(model)
-            entity_to_save['state_dict'] = model.state_dict()
-        elif hasattr(model, serialize_method_name) and callable(getattr(model, serialize_method_name)):
-            entity_to_save = getattr(model, serialize_method_name)()
+        if is_serializable(model):
+            entity_to_save = model.__serialize__()
+            torch.save(entity_to_save, path)
         else:
-            entity_to_save = {'state_dict': model.state_dict()}
-
-        torch.save(entity_to_save, path)
-
-    @staticmethod
-    def get_constructor_args_with_values(model: nn.Module):
-        """
-        Assigns values to __init__ params names
-
-        For example:
-
-        class Bar():
-            def __init__(self, arg1, arg2):
-                self.arg1 = arg1
-                self.some_other_name = args2
-
-
-        bar = Bar("A","B")
-        get_constructor_args_with_values(bar)
-        # returns {arg1: arg1_val, arg2: arg2_val}
-
-
-        Args:
-            model (nn.Module): model to infer from
-
-        Returns (dict): Mapping from __init__ argument to it's value
-
-        """
-        constructor_args = get_constructor_args(model)
-        model_attributes = model.__dict__
-        attributes_names = set(model_attributes.keys())
-
-        ambiguous_arguments = constructor_args - attributes_names
-
-        if ambiguous_arguments:
-            LOGGER.warning(f"Some constructor arguments do not have equivalent fields ")
-
-        return {}
+            raise TypeError("Only @serializable class can be serialized")


 class Loader:

diff --git a/sonosco/models/deepspeech2_sonosco.py b/sonosco/models/deepspeech2_sonosco.py
new file mode 100644
index 0000000..1e1b1c3
--- /dev/null
+++ b/sonosco/models/deepspeech2_sonosco.py
@@ -0,0 +1,118 @@
+import math
+from collections import OrderedDict
+from dataclasses import field
+
+import torch
+from torch import nn
+
+from models.deepspeech2 import MaskConv, BatchRNN, SequenceWise, InferenceBatchSoftmax, supported_rnns, \
+    supported_rnns_inv
+from serialization import serializable
+
+
+@serializable
+class DeepSpeech2(nn.Module):
+    rnn_type: nn.RNNBase = nn.LSTM
+    labels: str = "abc"
+    rnn_hid_size: int = 768
+    nb_layers: int = 5
+    audio_conf: dict = field(default_factory=dict)
+    bidirectional: bool = True
+    version: str = '0.0.1'
+
+    def __post_init__(self):
+        sample_rate = self.audio_conf.get("sample_rate", 16000)
+        window_size = self.audio_conf.get("window_size", 0.02)
+        num_classes = len(self.labels)
+        self.conv = MaskConv(nn.Sequential(
+            nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5)),
+            nn.BatchNorm2d(32),
+            nn.Hardtanh(0, 20, inplace=True),
+            nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5)),
+            nn.BatchNorm2d(32),
+            nn.Hardtanh(0, 20, inplace=True)
+        ))
+        # Based on above convolutions and spectrogram size using conv formula (W - F + 2P)/ S+1
+        rnn_in_size = int(math.floor((sample_rate * window_size) / 2) + 1)
+        rnn_in_size = int(math.floor(rnn_in_size + 2 * 20 - 41) / 2 + 1)
+        rnn_in_size = int(math.floor(rnn_in_size + 2 * 10 - 21) / 2 + 1)
+        rnn_in_size *= 32
+
+        rnns = [('0', BatchRNN(input_size=rnn_in_size, hidden_size=self.rnn_hid_size, rnn_type=self.rnn_type, batch_norm=False))]
+        rnns.extend([(f"{x + 1}", BatchRNN(input_size=self.rnn_hid_size, hidden_size=self.rnn_hid_size, rnn_type=self.rnn_type))
+                     for x in range(self.nb_layers - 1)])
+        self.rnns = nn.Sequential(OrderedDict(rnns))
+
+        fully_connected = nn.Sequential(
+            nn.BatchNorm1d(self.rnn_hid_size),
+            nn.Linear(self.rnn_hid_size, num_classes, bias=False)
+        )
+
+        self.fc = nn.Sequential(
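+            # SequenceWise collapses the TxNxH input to (T*N)xH, applies the wrapped
+            # modules (BatchNorm1d and Linear here) and restores the original shape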
+            SequenceWise(fully_connected),
+        )
+
+        self.inference_softmax = InferenceBatchSoftmax()
+
+    def forward(self, x, lengths):
+        # if x.is_cuda and self.mixed_precision:
+        #     x = x.half()
+        lengths = lengths.cpu().int()
+        output_lengths = self.get_seq_lens(lengths)
+        x, _ = self.conv(x, output_lengths)
+
+        sizes = x.size()
+        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # Collapse feature dimension
+        x = x.transpose(1, 2).transpose(0, 1).contiguous()  # TxNxH
+
+        for rnn in self.rnns:
+            x = rnn(x, output_lengths)
+
+        if not self.bidirectional:  # no need for lookahead layer in bidirectional
+            x = self.lookahead(x)
+
+        x = self.fc(x)
+        x = x.transpose(0, 1)
+        # identity in training mode, softmax in eval mode
+        x = self.inference_softmax(x)
+        return x, output_lengths
+
+    def get_seq_lens(self, input_length):
+        """
+        Given a 1D Tensor or Variable containing integer sequence lengths, return a 1D tensor or variable
+        containing the size sequences that will be output by the network.
+        :param input_length: 1D Tensor
+        :return: 1D Tensor scaled by model
+        """
+        seq_len = input_length
+        for m in self.conv.modules():
+            if type(m) == nn.modules.conv.Conv2d:
+                seq_len = ((seq_len + 2 * m.padding[1] - m.dilation[1] * (m.kernel_size[1] - 1) - 1) / m.stride[1] + 1)
+        return seq_len.int()
+
+    @staticmethod
+    def get_param_size(model):
+        params = 0
+        for p in model.parameters():
+            tmp = 1
+            for x in p.size():
+                tmp *= x
+            params += tmp
+        return params
+
+    def __repr__(self):
+        rep = f"DeepSpeech2 version: {self.version}\n" + \
+              "=======================================\n" + \
+              "Recurrent Neural Network Properties\n" + \
+              f"  RNN Type:  \t{self.rnn_type.__name__.lower()}\n" + \
+              f"  RNN Layers:\t{self.nb_layers}\n" + \
+              f"  RNN Size:  \t{self.rnn_hid_size}\n" + \
+              f"  Classes:   \t{len(self.labels)}\n" + \
+              "---------------------------------------\n" + \
+              "Model Features\n" + \
+              f"  Labels:       \t{self.labels}\n" + \
+              f"  Sample Rate:  \t{self.audio_conf.get('sample_rate', 'n/a')}\n" + \
+              f"  Window Type:  \t{self.audio_conf.get('window', 'n/a')}\n" + \
+              f"  Window Size:  \t{self.audio_conf.get('window_size', 'n/a')}\n" + \
+              f"  Window Stride:\t{self.audio_conf.get('window_stride', 'n/a')}"
+        return rep

diff --git a/sonosco/serialization.py b/sonosco/serialization.py
index 24b136b..cae248e 100644
--- a/sonosco/serialization.py
+++ b/sonosco/serialization.py
@@ -29,33 +29,46 @@ def wrap(cls):
     return wrap(_cls)

+def is_serializable(obj):
+    return hasattr(obj, '__serialize__')

 def __add_serialize(cls):
     fields_to_serialize = fields(cls)
     sonosco_self = ['__sonosco_self__' if 'self' in fields_to_serialize else 'self']
-    serialize_body = __create_serialize_body(fields_to_serialize)
+    serialize_body = __create_serialize_body(cls, fields_to_serialize)
     return _create_fn('__serialize__', [sonosco_self], [f'return {serialize_body}'], return_type=dict)


-def __create_serialize_body(fields_to_serialize):
+def __create_serialize_body(cls, fields_to_serialize):
     body_lines = ["{"]
     for field in fields_to_serialize:
         if __is_primitive(field) or __is_iterable_of_primitives(field):
             body_lines.append(__create_dict_entry(field.name, f"self.{field.name}"))
         elif is_dataclass(field.type):
             body_lines.append(__create_dict_entry(field.name, f"self.{field.name}.__serlialize__()"))
+        elif __is_nn_class(field.type):
+            body_lines.append("'{}': {{".format(field.name))
+            __extract_from_nn(cls, body_lines)
+            body_lines.append("}")
         else:
             __throw_unsupported_data_type()
+    body_lines.append(__create_dict_entry("state_dict", "self.state_dict()"))
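+    # a state_dict entry is always appended, so the serialized dict carries
+    # the module weights alongside the constructor arguments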
body_lines.append("}") return body_lines +def __extract_from_nn(cls, body_lines): + constants = list(filter(lambda el: not el.startswith('_'), cls.__constants__)) + for constant in constants: + body_lines.append(__create_dict_entry(constant, f"self.{constant}")) + def __is_iterable_of_primitives(field): return field.__origin__ in __iterables and field.__args__[0] in __primitives def __throw_unsupported_data_type(): - raise TypeError("Unsupported data type. Only primitives, lists of primitives, " - "@serializable and @dataclass objects can be seralized") + raise TypeError("Unsupported data type. Only primitives, lists of primitives, torch.nn.Module" + "@serializable and @dataclass objects can be serialized") def __create_dict_entry(key, value): @@ -65,3 +78,5 @@ def __create_dict_entry(key, value): def __is_primitive(obj): return obj.type in __primitives +def __is_nn_class(cls): + return hasattr(cls, '__constants__') \ No newline at end of file From fc1431f8eaf0ed0925848116be30714c7f6468b2 Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Sun, 23 Jun 2019 15:02:15 +0200 Subject: [PATCH 50/58] fix deepspeech2 and first successful training run --- sonosco/common/path_utils.py | 2 +- sonosco/config/train_librispeech.yaml | 2 +- sonosco/datasets/data_loader.py | 6 +++--- sonosco/models/deepspeech2.py | 12 +++++++++--- sonosco/run_training.py | 12 +++++++++--- sonosco/training/trainer.py | 13 +++++-------- 6 files changed, 28 insertions(+), 19 deletions(-) diff --git a/sonosco/common/path_utils.py b/sonosco/common/path_utils.py index 03321bb..2199b8f 100644 --- a/sonosco/common/path_utils.py +++ b/sonosco/common/path_utils.py @@ -16,4 +16,4 @@ def try_download(destination: str, url: str): def parse_yaml(file_path: str): with codecs.open(file_path, "r", "utf-8") as file: - return yaml.load(file) + return yaml.load(file, Loader=yaml.FullLoader) diff --git a/sonosco/config/train_librispeech.yaml b/sonosco/config/train_librispeech.yaml index eba936b..64d6294 100644 --- a/sonosco/config/train_librispeech.yaml +++ b/sonosco/config/train_librispeech.yaml @@ -37,7 +37,7 @@ train: seed: 123456 # Seed to generators cuda: True # Use cuda to train model - half_precision: Trues # Uses half precision to train a model + half_precision: True # Uses half precision to train a model apex: True # Uses mixed precision to train a model static_loss_scaling: False # Static loss scale for mixed precision dynamic_loss_scaling: True # Use dynamic loss scaling for mixed precision diff --git a/sonosco/datasets/data_loader.py b/sonosco/datasets/data_loader.py index 74912d6..79ed208 100644 --- a/sonosco/datasets/data_loader.py +++ b/sonosco/datasets/data_loader.py @@ -24,7 +24,7 @@ def _collate_fn(self, batch): freq_size, max_seqlength = longest_sample.size() minibatch_size = len(batch) inputs = torch.zeros(minibatch_size, 1, freq_size, max_seqlength) - input_percentages = torch.FloatTensor(minibatch_size) + input_lengths = torch.IntTensor(minibatch_size) target_sizes = np.zeros(minibatch_size, dtype=np.int32) # TODO: Numpy broadcasting magic @@ -32,11 +32,11 @@ def _collate_fn(self, batch): for x in range(minibatch_size): inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) - input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) + input_lengths[x] = batch[x][0].size(1) target_sizes[x] = len(batch[x][1]) targets.extend(batch[x][1]) - return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) + return inputs, torch.IntTensor(targets), input_lengths, 
torch.from_numpy(target_sizes) def create_data_loaders(**kwargs): diff --git a/sonosco/models/deepspeech2.py b/sonosco/models/deepspeech2.py index dcfa100..35645d2 100644 --- a/sonosco/models/deepspeech2.py +++ b/sonosco/models/deepspeech2.py @@ -7,10 +7,13 @@ from collections import OrderedDict import torch +import logging import torch.nn as nn import torch.nn.functional as F +LOGGER = logging.getLogger(__name__) + supported_rnns = { 'lstm': nn.LSTM, 'rnn': nn.RNN, @@ -82,13 +85,14 @@ def forward(self, input_): class BatchRNN(nn.Module): - def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, batch_norm=True): + def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, batch_norm=True, bidirectional=False): super(BatchRNN, self).__init__() + self.bidirectional = bidirectional self.input_size = input_size self.hidden_size = hidden_size self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size, - bidirectional=True, bias=True) + bidirectional=bidirectional, bias=True) def flatten_parameters(self): self.rnn.flatten_parameters() @@ -134,7 +138,8 @@ def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hid_size=768, nb_layers=5 nn.Hardtanh(0, 20, inplace=True) )) # Based on above convolutions and spectrogram size using conv formula (W - F + 2P)/ S+1 - rnn_in_size = int(math.floor((sample_rate * window_size) / 2) + 1) + rnn_in_size = int(math.floor((sample_rate * window_size) / 4) + 1) + LOGGER.debug(f"Initial calculated feature size: {rnn_in_size}") rnn_in_size = int(math.floor(rnn_in_size + 2 * 20 - 41) / 2 + 1) rnn_in_size = int(math.floor(rnn_in_size + 2 * 10 - 21) / 2 + 1) rnn_in_size *= 32 @@ -158,6 +163,7 @@ def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hid_size=768, nb_layers=5 def forward(self, x, lengths): # if x.is_cuda and self.mixed_precision: # x = x.half() + LOGGER.debug(f"Actual initial size: {x.size()}") lengths = lengths.cpu().int() output_lengths = self.get_seq_lens(lengths) x, _ = self.conv(x, output_lengths) diff --git a/sonosco/run_training.py b/sonosco/run_training.py index b5485e3..76e0a4e 100644 --- a/sonosco/run_training.py +++ b/sonosco/run_training.py @@ -22,12 +22,18 @@ def main(experiment_name, config_path): train_loader, val_loader = create_data_loaders(**config) + def custom_loss(batch, model): + batch_x, batch_y, input_lengths, target_lengths = batch + model_output, output_lengths = model(batch_x, input_lengths) + loss = torch_functional.ctc_loss(model_output.transpose(0, 1), batch_y, output_lengths, target_lengths) + return loss, model_output + # TODO: change to load different models dynamically - model = DeepSpeech2() + model = DeepSpeech2(labels=config["labels"]) - trainer = ModelTrainer(model, loss=torch_functional.ctc_loss, epochs=config["max_epochs"], + trainer = ModelTrainer(model, loss=custom_loss, epochs=config["max_epochs"], train_data_loader=train_loader, val_data_loader=val_loader, - lr=config["learning_rate"]) + lr=config["learning_rate"], custom_model_eval=True) try: trainer.start_training() diff --git a/sonosco/training/trainer.py b/sonosco/training/trainer.py index 15841b7..7c9bd83 100644 --- a/sonosco/training/trainer.py +++ b/sonosco/training/trainer.py @@ -26,14 +26,11 @@ class ModelTrainer: custom_model_eval (boolean, optional): enables training mode where the model is evaluated in the loss function gpu (int, optional): if not set training runs on cpu, otherwise an int is expected that determines the training gpu clip_grads 
(float, optional): if set training gradients will be clipped at specified norm - Example: - >>> model_trainer = ModelTrainer(model, optimizer, F.nll_loss, num_epochs, train_loader, gpu=0) - >>> model_trainer.start_training() """ def __init__(self, model: torch.nn.Module, - loss: Union[Callable[[torch.Tensor, torch.Tensor], float], + loss: Union[Callable[[Any, Any], Any], Callable[[torch.Tensor, torch.Tensor, torch.nn.Module], float]], epochs: int, train_data_loader: DataLoader, @@ -95,8 +92,8 @@ def _epoch_step(self, epoch): running_batch_loss = 0 running_metrics = defaultdict(float) - for step, batch in enumerate(self.train_data_loader): - import pdb; pdb.set_trace() + for step, (batch_x, batch_y, input_lengths, target_lengths) in enumerate(self.train_data_loader): + batch = (batch_x, batch_y, input_lengths, target_lengths) batch = self._recursive_to_cuda(batch) # move to GPU # compute training batch @@ -131,11 +128,11 @@ def _comp_gradients(self): def _train_on_batch(self, batch): """ Compute loss depending on settings, compute gradients and apply optimization step. """ # evaluate loss - batch_x, batch_y = batch + batch_x, batch_y, input_lengths, target_lengths = batch if self._custom_model_eval: loss, model_output = self.loss(batch, self.model) else: - model_output = self.model(batch_x) + model_output = self.model(batch_x, input_lengths) loss = self.loss(model_output, batch_y) self.optimizer.zero_grad() # reset gradients From 57caebc0280db70ef7ca4ff44ffd6b4aba9014ed Mon Sep 17 00:00:00 2001 From: ga38nif Date: Sun, 23 Jun 2019 15:27:30 +0200 Subject: [PATCH 51/58] rework audio dataloader and make it nicer --- sonosco/datasets/audio_data_loader.py | 40 ++++++++++++++------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/sonosco/datasets/audio_data_loader.py b/sonosco/datasets/audio_data_loader.py index 967b89c..0c44177 100644 --- a/sonosco/datasets/audio_data_loader.py +++ b/sonosco/datasets/audio_data_loader.py @@ -7,27 +7,29 @@ class AudioDataLoader(DataLoader): def __init__(self, *args, **kwargs): - """Creates a data loader for AudioDatasets.""" + ''' + Creates a data loader for AudioDatasets. + ''' super(AudioDataLoader, self).__init__(*args, **kwargs) self.collate_fn = self._collate_fn - # TODO: Optimise def _collate_fn(self, batch): + #sort the batch in decreasing order of sequence length batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True) - longest_sample = batch[0][0] - freq_size, max_seqlength = longest_sample.size() - minibatch_size = len(batch) - inputs = torch.zeros(minibatch_size, 1, freq_size, max_seqlength) - input_percentages = torch.FloatTensor(minibatch_size) - target_sizes = np.zeros(minibatch_size, dtype=np.int32) - - # TODO: Numpy broadcasting magic - targets = [] - - for x in range(minibatch_size): - inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) - input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) - target_sizes[x] = len(batch[x][1]) - targets.extend(batch[x][1]) - - return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) \ No newline at end of file + + #pad the tensors to have equal lengths, therefore transpose the tensors in + #the batch. 
The tensors have shape: freq_size x sequence_length
+        #and need to be of shape: sequence_length x freq_length, as sequence length differs
+        #but not the freq_length
+        inputs = torch.nn.utils.rnn.pad_sequence(list(map(lambda x: x[0].transpose(0,1), batch)), batch_first=True)
+
+        #inputs need to be transposed back from shape batch_size x sequence_length x freq_length
+        #to batch_size x freq_length x sequence_length. Additionally, unsqueeze tensor
+        inputs = inputs.transpose(1,2).unsqueeze(1)
+        input_lengths = torch.IntTensor(list(map(lambda x: x[0].size(1), batch))) #create tensor of input lengths
+
+        targets_arr = list(zip(*batch))[1] #extract targets array from batch ( batch is array of tuples)
+        target_lengths = torch.IntTensor(list(map(lambda x: len(x),targets_arr))) #create tensor of target lengths
+        targets = torch.cat(list(map(lambda x: torch.IntTensor(x), targets_arr))) #create tensor of targets
+
+        return inputs, targets, input_lengths, target_lengths

From e3e21d88d8ace89f47c8312ede5e3e621c23d43b Mon Sep 17 00:00:00 2001
From: Yuriy Arabskyy
Date: Mon, 24 Jun 2019 14:28:19 +0200
Subject: [PATCH 52/58] Fix typo

---
 sonosco/serialization.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/sonosco/serialization.py b/sonosco/serialization.py
index cae248e..9f905b6 100644
--- a/sonosco/serialization.py
+++ b/sonosco/serialization.py
@@ -2,6 +2,7 @@
 __primitives = {int, float, str, bool}
 __iterables = [list, set, tuple]

+
 def serializable(_cls=None):
     """

@@ -32,6 +33,7 @@
 def is_serializable(obj):
     return hasattr(obj, '__serialize__')

+
 def __add_serialize(cls):
     fields_to_serialize = fields(cls)
     sonosco_self = ['__sonosco_self__' if 'self' in fields_to_serialize else 'self']
@@ -45,7 +47,7 @@ def __create_serialize_body(cls, fields_to_serialize):
         if __is_primitive(field) or __is_iterable_of_primitives(field):
             body_lines.append(__create_dict_entry(field.name, f"self.{field.name}"))
         elif is_dataclass(field.type):
-            body_lines.append(__create_dict_entry(field.name, f"self.{field.name}.__serlialize__()"))
+            body_lines.append(__create_dict_entry(field.name, f"self.{field.name}.__serialize__()"))
         elif __is_nn_class(field.type):
             body_lines.append("'{}': {{".format(field.name))
@@ -56,6 +58,7 @@ def __create_serialize_body(cls, fields_to_serialize):
     body_lines.append("}")
     return body_lines

+
 def __extract_from_nn(cls, body_lines):
     constants = list(filter(lambda el: not el.startswith('_'), cls.__constants__))
     for constant in constants:
@@ -78,5 +81,6 @@ def __create_dict_entry(key, value):

 def __is_primitive(obj):
     return obj.type in __primitives

+
 def __is_nn_class(cls):
-    return hasattr(cls, '__constants__')
\ No newline at end of file
+    return hasattr(cls, '__constants__')

From 341e32abb89f53f63b6faa1ce6499c90b5fee96f Mon Sep 17 00:00:00 2001
From: yuriyarabskyy
Date: Mon, 24 Jun 2019 22:47:51 +0200
Subject: [PATCH 53/58] resolve comments regarding noise makers

---
 .gitignore                     |  2 --
 setup.py                       |  4 ++--
 sonosco/common/audio_tools.py  | 19 ++-----------------
 sonosco/common/noise_makers.py | 18 ++++++++++++++++++
 4 files changed, 22 insertions(+), 21 deletions(-)
 create mode 100644 sonosco/common/noise_makers.py

diff --git a/.gitignore b/.gitignore
index dcee099..58fc0df 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,8 +2,6 @@
 sonosco/pycandle/
 experiments/
 tests/test_wavs/
-sonosco/pycandle
-sonosco/experiments/

 **/.DS_Store

diff --git a/setup.py b/setup.py
index 207c04e..4a1ac9b 100644
--- a/setup.py
+++ b/setup.py
@@ -2,8
+2,8 @@ setup( name="sonosco", - description="Framework for training automatic speech recognition systems.", - author="The Roboy Gang", + description="Framework for deep automatic speech recognition systems.", + author="Roboy", packages=["sonosco"], include_package_data=True, dependency_links=[] diff --git a/sonosco/common/audio_tools.py b/sonosco/common/audio_tools.py index 9ab77ab..807e79b 100644 --- a/sonosco/common/audio_tools.py +++ b/sonosco/common/audio_tools.py @@ -2,6 +2,8 @@ import numpy as np import librosa +from .noise_makers import NoiseMaker, GaussianNoiseMaker + def get_duration(file_path): return float(subprocess.check_output([f'soxi -D "{file_path.strip()}"'], shell=True)) @@ -19,23 +21,6 @@ def transcode_recordings_ted3(source, destination, start_time, end_time, sample_ subprocess.call([f"sox {source} -r {sample_rate} -b 16 -c 1 {destination} trim {start_time} ={end_time}"],shell=True) -class NoiseMaker: - - def __call__(self, audio): - """Adds noise to the audio signal.""" - pass - - -class GaussianNoiseMaker(NoiseMaker): - - def __init__(self, std=0.002): - self.std = std - - def __call__(self, audio): - noise = np.random.randn(len(audio)) - return audio + self.std * noise - - def add_noise(audio, noise_maker: NoiseMaker = None): if noise_maker is None: noise_maker = GaussianNoiseMaker() diff --git a/sonosco/common/noise_makers.py b/sonosco/common/noise_makers.py new file mode 100644 index 0000000..551e2ec --- /dev/null +++ b/sonosco/common/noise_makers.py @@ -0,0 +1,18 @@ +import numpy as np + + +class NoiseMaker: + + def __call__(self, audio): + """Adds noise to the audio signal.""" + pass + + +class GaussianNoiseMaker(NoiseMaker): + + def __init__(self, std=0.002): + self.std = std + + def __call__(self, audio): + noise = np.random.randn(len(audio)) + return audio + self.std * noise From 37d9a1c6b3bc4e2bd84553bd931f100458f99b38 Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Mon, 24 Jun 2019 23:43:48 +0200 Subject: [PATCH 54/58] resolve comments --- .gitignore | 1 - requirements.txt | 1 + sonosco/datasets/__init__.py | 6 +- sonosco/datasets/audio_dataset.py | 137 ------------------ sonosco/datasets/dataset.py | 53 +++++++ sonosco/datasets/download_datasets/an4.py | 12 +- .../download_datasets/common_voice.py | 5 +- .../download_datasets/merge_manifests.py | 7 +- .../datasets/{data_loader.py => loader.py} | 4 +- .../datasets/{data_sampler.py => samplers.py} | 2 +- tests/test_dataset.py | 6 +- 11 files changed, 76 insertions(+), 158 deletions(-) delete mode 100644 sonosco/datasets/audio_dataset.py create mode 100644 sonosco/datasets/dataset.py rename sonosco/datasets/{data_loader.py => loader.py} (96%) rename sonosco/datasets/{data_sampler.py => samplers.py} (97%) diff --git a/.gitignore b/.gitignore index 58fc0df..9717b09 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ # Created by .ignore support plugin (hsz.mobi) -sonosco/pycandle/ experiments/ tests/test_wavs/ diff --git a/requirements.txt b/requirements.txt index ca4f802..8ae1526 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,3 +23,4 @@ wget==3.2 pytest==4.6.3 click==7.0 deprecation==2.0.6 +dataclasses==0.6 diff --git a/sonosco/datasets/__init__.py b/sonosco/datasets/__init__.py index 9cfa7a3..b74d5b6 100644 --- a/sonosco/datasets/__init__.py +++ b/sonosco/datasets/__init__.py @@ -1,3 +1,3 @@ -from .audio_dataset import AudioDataProcessor, AudioDataset -from .data_sampler import BucketingSampler -from .data_loader import AudioDataLoader, create_data_loaders +from .dataset import 
AudioDataProcessor, AudioDataset +from .samplers import BucketingSampler +from .loader import AudioDataLoader, create_data_loaders diff --git a/sonosco/datasets/audio_dataset.py b/sonosco/datasets/audio_dataset.py deleted file mode 100644 index 74683f0..0000000 --- a/sonosco/datasets/audio_dataset.py +++ /dev/null @@ -1,137 +0,0 @@ -# ---------------------------------------------------------------------------- -# Based on SeanNaren's deepspeech.pytorch: -# https://github.com/SeanNaren/deepspeech.pytorch -# ---------------------------------------------------------------------------- - -import logging -import torch -import librosa -import numpy as np -import sonosco.config.global_settings as global_settings -import sonosco.common.audio_tools as audio_tools -import sonosco.common.utils as utils - -from torch.utils.data import Dataset - - -LOGGER = logging.getLogger(__name__) -MIN_STRETCH = 0.7 -MAX_STRETCH = 1.3 -MIN_PITCH = 0.7 -MAX_PITCH = 1.5 -MAX_SHIFT = 4000 - - -class AudioDataProcessor: - - def __init__(self, window_stride, window_size, sample_rate, labels="abc", normalize=False, augment=False, **kwargs): - """ - Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by - a comma. Each new line is a different sample. Example below: - /path/to/audio.wav,/path/to/audio.txt - ... - :param window_stride: number of seconds to skip between each window - :param window_size: number of seconds to use for a window of spectrogram - :param sample_rate: sample rate of the recordings - :param labels: string containing all the possible characters to map to - :param normalize: apply standard mean and deviation normalization to audio tensor - :param augment(default False): apply random tempo and gain perturbations - """ - self.window_stride = window_stride - self.window_size = window_size - self.sample_rate = sample_rate - self.labels_map = dict([(labels[i], i) for i in range(len(labels))]) - self.normalize = normalize - self.augment = augment - - @property - def window_stride_samples(self): - return int(self.sample_rate * self.window_stride) - - @property - def window_size_samples(self): - return int(self.sample_rate * self.window_stride) - - def retrieve_file(self, audio_path): - sound, sample_rate = librosa.load(audio_path, sr=self.sample_rate) - return sound, sample_rate - - def augment_audio(self, sound, stretch=True, shift=False, pitch=True, noise=True): - augmented = audio_tools.stretch(sound, utils.random_float(MIN_STRETCH, MAX_STRETCH)) if stretch else sound - augmented = audio_tools.shift(augmented, np.random.randint(MAX_SHIFT)) if shift else augmented - augmented = audio_tools.pitch_shift(augmented, self.sample_rate, - n_steps=utils.random_float(MIN_PITCH, MAX_PITCH)) if pitch else augmented - augmented = audio_tools.add_noise(augmented) if noise else augmented - return augmented - - def parse_audio(self, audio_path, raw=False): - sound, sample_rate = self.retrieve_file(audio_path) - - if sample_rate != self.sample_rate: - raise ValueError(f"The stated sample rate {self.sample_rate} and the factual rate {sample_rate} differ!") - - if self.augment: - sound = self.augment_audio(sound) - - if raw: - return sound - - # TODO: comment why take the last element? 
- complex_spectrogram = librosa.stft(sound, - n_fft=self.window_size_samples, - hop_length=self.window_stride_samples, - win_length=self.window_size_samples) - spectrogram, phase = librosa.magphase(complex_spectrogram) - # S = log(S+1) - spectrogram = torch.from_numpy(np.log1p(spectrogram)) - - return spectrogram - - def parse_transcript(self, transcript_path): - with open(transcript_path, 'r', encoding='utf8') as transcript_file: - transcript = transcript_file.read().replace('\n', '') - # TODO: Is it fast enough? - transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)])) - LOGGER.debug(f"transcript_path: {transcript_path} transcript: {transcript}") - return transcript - - -class AudioDataset(Dataset): - - def __init__(self, processor: AudioDataProcessor, manifest_filepath): - """ - Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by - a comma. Each new line is a different sample. Example below: - /path/to/audio.wav,/path/to/audio.txt - ... - :param processor: Data processor object - :param manifest_filepath: Path to manifest csv as describe above - """ - super().__init__() - with open(manifest_filepath) as f: - ids = f.readlines() - ids = [x.strip().split(',') for x in ids] - self.ids = ids - self.size = len(ids) - self.processor = processor - - def get_raw(self, index): - sample = self.ids[index] - audio_path, transcript_path = sample[0], sample[1] - - sound = self.processor.parse_audio(audio_path, raw=True) - transcript = self.processor.parse_transcript(transcript_path) - - return sound, transcript - - def __getitem__(self, index): - sample = self.ids[index] - audio_path, transcript_path = sample[0], sample[1] - - spectrogram = self.processor.parse_audio(audio_path) - transcript = self.processor.parse_transcript(transcript_path) - - return spectrogram, transcript - - def __len__(self): - return self.size diff --git a/sonosco/datasets/dataset.py b/sonosco/datasets/dataset.py new file mode 100644 index 0000000..8345f30 --- /dev/null +++ b/sonosco/datasets/dataset.py @@ -0,0 +1,53 @@ +# ---------------------------------------------------------------------------- +# Based on SeanNaren's deepspeech.pytorch: +# https://github.com/SeanNaren/deepspeech.pytorch +# ---------------------------------------------------------------------------- + +import logging + +from torch.utils.data import Dataset +from .processor import AudioDataProcessor + + +LOGGER = logging.getLogger(__name__) + + +class AudioDataset(Dataset): + + def __init__(self, processor: AudioDataProcessor, manifest_filepath): + """ + Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by + a comma. Each new line is a different sample. Example below: + /path/to/audio.wav,/path/to/audio.txt + ... 
+ :param processor: Data processor object + :param manifest_filepath: Path to manifest csv as describe above + """ + super().__init__() + with open(manifest_filepath) as f: + ids = f.readlines() + ids = [x.strip().split(',') for x in ids] + self.ids = ids + self.size = len(ids) + self.processor = processor + + def get_raw(self, index): + sample = self.ids[index] + audio_path, transcript_path = sample[0], sample[1] + + sound = self.processor.parse_audio(audio_path, raw=True) + transcript = self.processor.parse_transcript(transcript_path) + + return sound, transcript + + def __getitem__(self, index): + sample = self.ids[index] + audio_path, transcript_path = sample[0], sample[1] + + spectrogram = self.processor.parse_audio(audio_path) + transcript = self.processor.parse_transcript(transcript_path) + + return spectrogram, transcript + + def __len__(self): + return self.size diff --git a/sonosco/datasets/download_datasets/an4.py b/sonosco/datasets/download_datasets/an4.py index 1d4368b..374cb5b 100644 --- a/sonosco/datasets/download_datasets/an4.py +++ b/sonosco/datasets/download_datasets/an4.py @@ -15,6 +15,7 @@ AN4_URL = 'http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz' + def try_download_an4(target_dir, sample_rate, min_duration, max_duration): path_to_data = os.path.join(os.path.expanduser("~"), target_dir) if not os.path.exists(path_to_data): @@ -44,6 +45,7 @@ def try_download_an4(target_dir, sample_rate, min_duration, max_duration): create_manifest(path_to_data, os.path.join(path_to_data,'an4_train_manifest.csv'), min_duration, max_duration) create_manifest(path_to_data, os.path.join(path_to_data,'an4_val_manifest.csv'), min_duration, max_duration) + def create_wav_and_transcripts(path, data_tag, sample_rate, extracted_dir, wav_subfolder_name): tag_path = os.path.join(path,data_tag) transcript_path_new = os.path.join(tag_path, 'txt') @@ -59,6 +61,7 @@ def create_wav_and_transcripts(path, data_tag, sample_rate, extracted_dir, wav_s convert_audio_to_wav(path, sample_rate) format_files(file_ids, transcript_path_new, wav_path_new, transcripts_ext, wav_path_ext) + def convert_audio_to_wav(train_path, sample_rate): with os.popen('find %s -type f -name "*.raw"' % train_path) as pipe: for line in pipe: @@ -88,6 +91,7 @@ def _process_transcript(transcripts, x): extracted_transcript = transcripts[x].split('(')[0].strip("").split('<')[0].strip().upper() return extracted_transcript + @click.command() @click.option("--target-dir", default="temp/data/an4", type=str, help="Directory to store the dataset.") @click.option("--sample-rate", default=16000, type=int, help="Sample rate.") @@ -95,14 +99,12 @@ def _process_transcript(transcripts, x): help="Prunes training samples shorter than the min duration (given in seconds).") @click.option("--max-duration", default=15, type=int, help="Prunes training samples longer than the max duration (given in seconds).") - def main(**kwargs): """Processes and downloads an4 dataset.""" - global LOGGER - logger = logging.getLogger(SONOSCO) - setup_logging(logger) try_download_an4(**kwargs) if __name__ == '__main__': - main() \ No newline at end of file + LOGGER = logging.getLogger(SONOSCO) + setup_logging(LOGGER) + main() diff --git a/sonosco/datasets/download_datasets/common_voice.py b/sonosco/datasets/download_datasets/common_voice.py index 4dfc078..41bc102 100644 --- a/sonosco/datasets/download_datasets/common_voice.py +++ b/sonosco/datasets/download_datasets/common_voice.py @@ -56,7 +56,6 @@ def try_download_common_voice(target_dir, sample_rate, 
files_to_use, min_duratio max_duration) - def convert_to_wav(csv_file, target_dir, sample_rate): """ Read *.csv file description, convert mp3 to wav, process text. Save results to target_dir. @@ -76,7 +75,9 @@ def process(x): text = text.strip().upper() with open(os.path.join(txt_dir, file_name + '.txt'), 'w') as f: f.write(text) - audio_tools(source = os.path.join(path_to_data, file_path), destination=os.path.join(wav_dir, file_name + '.wav'), sample_rate = sample_rate) + audio_tools.transcode_recording(source=os.path.join(path_to_data, file_path), + destination=os.path.join(wav_dir, file_name + '.wav'), + sample_rate=sample_rate) LOGGER.info('Converting mp3 to wav for {}.'.format(csv_file)) with open(csv_file) as csvfile: diff --git a/sonosco/datasets/download_datasets/merge_manifests.py b/sonosco/datasets/download_datasets/merge_manifests.py index e5e0fab..6218d52 100644 --- a/sonosco/datasets/download_datasets/merge_manifests.py +++ b/sonosco/datasets/download_datasets/merge_manifests.py @@ -1,11 +1,10 @@ -from __future__ import print_function - import argparse import io import os from tqdm import tqdm -from utils import order_and_prune_files +from .data_utils import order_and_prune_files + parser = argparse.ArgumentParser(description='Merges all manifest CSV files in specified folder.') parser.add_argument('--merge-dir', default='manifests/', help='Path to all manifest files you want to merge') @@ -28,4 +27,4 @@ for wav_path in tqdm(file_paths, total=len(file_paths)): transcript_path = wav_path.replace('/wav/', '/txt/').replace('.wav', '.txt') sample = os.path.abspath(wav_path) + ',' + os.path.abspath(transcript_path) + '\n' - file.write(sample.encode('utf-8')) \ No newline at end of file + file.write(sample.encode('utf-8')) diff --git a/sonosco/datasets/data_loader.py b/sonosco/datasets/loader.py similarity index 96% rename from sonosco/datasets/data_loader.py rename to sonosco/datasets/loader.py index f6f7642..92c3951 100644 --- a/sonosco/datasets/data_loader.py +++ b/sonosco/datasets/loader.py @@ -3,8 +3,8 @@ import torch from torch.utils.data import Dataset, DataLoader, Sampler -from .audio_dataset import AudioDataProcessor, AudioDataset -from .data_sampler import BucketingSampler +from .dataset import AudioDataProcessor, AudioDataset +from .samplers import BucketingSampler LOGGER = logging.getLogger(__name__) diff --git a/sonosco/datasets/data_sampler.py b/sonosco/datasets/samplers.py similarity index 97% rename from sonosco/datasets/data_sampler.py rename to sonosco/datasets/samplers.py index b3bfc14..416b754 100644 --- a/sonosco/datasets/data_sampler.py +++ b/sonosco/datasets/samplers.py @@ -66,4 +66,4 @@ def shuffle(self, epoch): g = torch.Generator() g.manual_seed(epoch) bin_ids = list(torch.randperm(len(self.bins), generator=g)) - self.bins = [self.bins[i] for i in bin_ids] \ No newline at end of file + self.bins = [self.bins[i] for i in bin_ids] diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 69cb1cb..2acd1a9 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -6,9 +6,9 @@ from sonosco.common.constants import SONOSCO from sonosco.common.utils import setup_logging -from sonosco.datasets.audio_dataset import AudioDataset, AudioDataProcessor -from sonosco.datasets.data_sampler import BucketingSampler -from sonosco.datasets.data_loader import DataLoader +from sonosco.datasets.dataset import AudioDataset, AudioDataProcessor +from sonosco.datasets.samplers import BucketingSampler +from sonosco.datasets.loader import DataLoader from 
sonosco.datasets.download_datasets.librispeech import try_download_librispeech From b46c558157e348172cdb0b26a213311f9e2460e0 Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Tue, 25 Jun 2019 13:56:55 +0200 Subject: [PATCH 55/58] restructure classes into separate modules --- sonosco/datasets/processor.py | 88 +++++++++++++++ sonosco/training/early_stopping.py | 45 ++++++++ sonosco/training/gradient_collector.py | 50 +++++++++ sonosco/training/helpers.py | 143 ------------------------- sonosco/training/history_recorder.py | 26 +++++ sonosco/training/model_checkpoint.py | 43 ++++++++ 6 files changed, 252 insertions(+), 143 deletions(-) create mode 100644 sonosco/datasets/processor.py create mode 100644 sonosco/training/early_stopping.py create mode 100644 sonosco/training/gradient_collector.py delete mode 100644 sonosco/training/helpers.py create mode 100644 sonosco/training/history_recorder.py create mode 100644 sonosco/training/model_checkpoint.py diff --git a/sonosco/datasets/processor.py b/sonosco/datasets/processor.py new file mode 100644 index 0000000..2bbf7a8 --- /dev/null +++ b/sonosco/datasets/processor.py @@ -0,0 +1,88 @@ +import logging +import torch +import librosa +import numpy as np +import sonosco.common.audio_tools as audio_tools +import sonosco.common.utils as utils + + +LOGGER = logging.getLogger(__name__) +MIN_STRETCH = 0.7 +MAX_STRETCH = 1.3 +MIN_PITCH = 0.7 +MAX_PITCH = 1.5 +MAX_SHIFT = 4000 + + +class AudioDataProcessor: + + def __init__(self, window_stride, window_size, sample_rate, labels="abc", normalize=False, augment=False, **kwargs): + """ + Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by + a comma. Each new line is a different sample. Example below: + /path/to/audio.wav,/path/to/audio.txt + ... 
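+        (e.g. with sample_rate=16000 and window_size=0.02, one spectrogram
+        window covers int(16000 * 0.02) = 320 samples)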
+        :param window_stride: number of seconds to skip between each window
+        :param window_size: number of seconds to use for a window of spectrogram
+        :param sample_rate: sample rate of the recordings
+        :param labels: string containing all the possible characters to map to
+        :param normalize: apply standard mean and deviation normalization to audio tensor
+        :param augment: apply random tempo and gain perturbations (default: False)
+        """
+        self.window_stride = window_stride
+        self.window_size = window_size
+        self.sample_rate = sample_rate
+        self.labels_map = dict([(labels[i], i) for i in range(len(labels))])
+        self.normalize = normalize
+        self.augment = augment
+
+    @property
+    def window_stride_samples(self):
+        return int(self.sample_rate * self.window_stride)
+
+    @property
+    def window_size_samples(self):
+        return int(self.sample_rate * self.window_size)
+
+    def retrieve_file(self, audio_path):
+        sound, sample_rate = librosa.load(audio_path, sr=self.sample_rate)
+        return sound, sample_rate
+
+    def augment_audio(self, sound, stretch=True, shift=False, pitch=True, noise=True):
+        augmented = audio_tools.stretch(sound, utils.random_float(MIN_STRETCH, MAX_STRETCH)) if stretch else sound
+        augmented = audio_tools.shift(augmented, np.random.randint(MAX_SHIFT)) if shift else augmented
+        augmented = audio_tools.pitch_shift(augmented, self.sample_rate,
+                                            n_steps=utils.random_float(MIN_PITCH, MAX_PITCH)) if pitch else augmented
+        augmented = audio_tools.add_noise(augmented) if noise else augmented
+        return augmented
+
+    def parse_audio(self, audio_path, raw=False):
+        sound, sample_rate = self.retrieve_file(audio_path)
+
+        if sample_rate != self.sample_rate:
+            raise ValueError(f"The stated sample rate {self.sample_rate} and the actual rate {sample_rate} differ!")
+
+        if self.augment:
+            sound = self.augment_audio(sound)
+
+        if raw:
+            return sound
+
+        # TODO: comment why take the last element?
+        complex_spectrogram = librosa.stft(sound,
+                                           n_fft=self.window_size_samples,
+                                           hop_length=self.window_stride_samples,
+                                           win_length=self.window_size_samples)
+        spectrogram, phase = librosa.magphase(complex_spectrogram)
+        # S = log(S+1)
+        spectrogram = torch.from_numpy(np.log1p(spectrogram))
+
+        return spectrogram
+
+    def parse_transcript(self, transcript_path):
+        with open(transcript_path, 'r', encoding='utf8') as transcript_file:
+            transcript = transcript_file.read().replace('\n', '')
+        # TODO: Is it fast enough?
+        transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)]))
+        LOGGER.debug(f"transcript_path: {transcript_path} transcript: {transcript}")
+        return transcript
diff --git a/sonosco/training/early_stopping.py b/sonosco/training/early_stopping.py
new file mode 100644
index 0000000..8eefa6b
--- /dev/null
+++ b/sonosco/training/early_stopping.py
@@ -0,0 +1,45 @@
+import logging
+import sys
+
+from .abstract_callback import AbstractCallback
+
+
+LOGGER = logging.getLogger(__name__)
+
+
+class EarlyStopping(AbstractCallback):
+    """
+    Early Stopping to terminate training early if the monitored metric did not improve
+    over a number of epochs.
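+
+    Example (illustrative):
+        early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=5)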

diff --git a/sonosco/training/early_stopping.py b/sonosco/training/early_stopping.py
new file mode 100644
index 0000000..8eefa6b
--- /dev/null
+++ b/sonosco/training/early_stopping.py
@@ -0,0 +1,45 @@
+import logging
+import sys
+
+from .abstract_callback import AbstractCallback
+
+
+LOGGER = logging.getLogger(__name__)
+
+
+class EarlyStopping(AbstractCallback):
+    """
+    Early Stopping to terminate training early if the monitored metric did not improve
+    over a number of epochs.
+
+    Args:
+        monitor (string): name of the relevant loss or metric (usually 'val_loss')
+        min_delta (float): minimum change in monitored metric to qualify as an improvement
+        patience (int): number of epochs to wait for an improvement before terminating the training
+    """
+
+    def __init__(self, monitor='val_loss', min_delta=0, patience=5):
+        self.monitor = monitor
+        self.min_delta = min_delta
+        self.patience = patience
+        self.last_best = sys.float_info.max
+        self.counter = 0
+        self.stopped_epoch = 0
+
+    def __call__(self, epoch, step, performance_measures, context):
+
+        if step != len(context.train_data_loader) - 1:  # only continue at end of epoch
+            return
+
+        if self.monitor not in performance_measures:
+            return
+
+        current_loss = performance_measures[self.monitor]
+        if (self.last_best - current_loss) >= self.min_delta:
+            self.last_best = current_loss
+            self.counter = 0
+        else:
+            self.counter += 1
+
+        if self.counter >= self.patience:
+            context._stop_training = True  # make ModelTrainer stop
+            LOGGER.info(f"Early stopping after epoch {epoch}")

diff --git a/sonosco/training/gradient_collector.py b/sonosco/training/gradient_collector.py
new file mode 100644
index 0000000..386b874
--- /dev/null
+++ b/sonosco/training/gradient_collector.py
@@ -0,0 +1,50 @@
+import logging
+import numpy as np
+import torch
+
+from .abstract_callback import AbstractCallback
+
+
+LOGGER = logging.getLogger(__name__)
+
+
+class LayerwiseGradientNorm(AbstractCallback):
+    """ Collects the layer-wise gradient norms for each epoch. """
+
+    def __init__(self):
+        self.layer_grads = dict()
+        self._batch_layer_grads = dict()
+
+    def __call__(self, epoch, step, performance_measures, context):
+        """
+        Store gradient norms for each batch and compute means after the
+        epoch's last batch.
+        """
+        self._store_batch_layer_grads(context.model)
+
+        if step == (len(context.train_data_loader) - 1):  # end of epoch
+            self._store_layer_grads()
+            self._batch_layer_grads = dict()
+
+    def _store_batch_layer_grads(self, model):
+        """ Store gradient norm of each layer for current batch. """
+        for name, param in model.named_parameters():
+
+            if not param.requires_grad or param.grad is None:
+                continue
+
+            if name not in self._batch_layer_grads:
+                self._batch_layer_grads[name] = []
+
+            grad_norm = torch.sqrt(torch.sum(param.grad**2)).item()
+            self._batch_layer_grads[name].append(grad_norm)
+
+    def _store_layer_grads(self):
+        """ Compute mean of all batch steps in epoch. """
+        for name, grads in self._batch_layer_grads.items():
+
+            if name not in self.layer_grads:
+                self.layer_grads[name] = []
+
+            layer_epoch_grad = np.mean(grads)
+            self.layer_grads[name].append(layer_epoch_grad)

diff --git a/sonosco/training/helpers.py b/sonosco/training/helpers.py
deleted file mode 100644
index 893c14d..0000000
--- a/sonosco/training/helpers.py
+++ /dev/null
@@ -1,143 +0,0 @@
-import logging
-import sys
-import os.path as path
-import numpy as np
-import torch
-
-from collections import defaultdict
-from .abstract_callback import AbstractCallback
-
-
-LOGGER = logging.getLogger(__name__)
-
-
-class HistoryRecorder(AbstractCallback):
-    """ Records all losses and metrics during training.
""" - - def __init__(self, epoch_steps): - self.history = defaultdict(list) - self._epoch_steps = epoch_steps - - def __call__(self, epoch, step, performance_measures, context): - - if step % self._epoch_steps == 0: # only record at end of epoch - return - - for key, value in performance_measures.items(): - if type(value) == torch.Tensor: - value = value.item() - self.history[key].append(value) - - -class ModelCheckpoint(AbstractCallback): - """ - Saves the model and optimizer state at the point with lowest validation error throughout training. - Args: - output_path (string): path to directory where the checkpoint will be saved to - model_name (string): name of the checkpoint file - """ - - def __init__(self, output_path, model_name='model_checkpoint.pt'): - self.output_path = path.join(output_path, model_name) - self.best_val_score = sys.float_info.max - - def __call__(self, epoch, step, performance_measures, context): - - if 'val_loss' not in performance_measures: - return - - if performance_measures['val_loss'] < self.best_val_score: - self.best_val_score = performance_measures['val_loss'] - self._save_checkpoint(context.model, context.optimizer, epoch) - - def _save_checkpoint(self, model, optimizer, epoch): - LOGGER.info("Saving model at checkpoint.") - model.eval() - model_state_dict = model.state_dict() - optimizer_state_dict = optimizer.state_dict() - torch.save({'arch': model.__class__.__name__, - 'epoch': epoch, - 'model_state_dict': model_state_dict, - 'optimizer_state_dict': optimizer_state_dict - }, self.output_path) - model.train() - - -class LayerwiseGradientNorm(AbstractCallback): - """ Collects the layer-wise gradient norms for each epoch. """ - - def __init__(self): - self.layer_grads = dict() - self._batch_layer_grads = dict() - - def __call__(self, epoch, step, performance_measures, context): - """ - Store gradient norms for each batch and compute means after the - epoch's last batch. - """ - self._store_batch_layer_grads(context.model) - - if step == (len(context.train_data_loader) - 1): # end of epoch - self._store_layer_grads() - self._batch_layer_grads = dict() - - def _store_batch_layer_grads(self, model): - """ Store gradient norm of each layer for current batch. """ - for name, param in model.named_parameters(): - - if not param.requires_grad or param.grad is None: - continue - - if not name in self._batch_layer_grads: - self._batch_layer_grads[name] = [] - - grad_norm = torch.sqrt(torch.sum(param.grad**2)).item() - self._batch_layer_grads[name].append(grad_norm) - - def _store_layer_grads(self): - """ Compute mean of all batch steps in epoch. """ - for name, grads in self._batch_layer_grads.items(): - - if name not in self.layer_grads: - self.layer_grads[name] = [] - - layer_epoch_grad = np.mean(grads) - self.layer_grads[name].append(layer_epoch_grad) - - -class EarlyStopping(AbstractCallback): - """ - Early Stopping to terminate training early if the monitored metric did not improve - over a number of epochs. 
-
-    Args:
-        monitor (string): name of the relevant loss or metric (usually 'val_loss')
-        min_delta (float): minimum change in monitored metric to qualify as an improvement
-        patience (int): number of epochs to wait for an improvement before terminating the training
-    """
-
-    def __init__(self, monitor='val_loss', min_delta=0, patience=5):
-        self.monitor = monitor
-        self.min_delta = min_delta
-        self.patience = patience
-        self.last_best = sys.float_info.max
-        self.counter = 0
-        self.stopped_epoch = 0
-
-    def __call__(self, epoch, step, performance_measures, context):
-
-        if step != len(context.train_data_loader) - 1:  # only continue at end of epoch
-            return
-
-        if self.monitor not in performance_measures:
-            return
-
-        current_loss = performance_measures[self.monitor]
-        if (self.last_best - current_loss) >= self.min_delta:
-            self.last_best = current_loss
-            self.counter = 0
-        else:
-            self.counter += 1
-
-        if self.counter >= self.patience:
-            context._stop_training = True  # make ModelTrainer stop
-            LOGGER.info(f"Early stopping after epoch {epoch}")

diff --git a/sonosco/training/history_recorder.py b/sonosco/training/history_recorder.py
new file mode 100644
index 0000000..5737a7b
--- /dev/null
+++ b/sonosco/training/history_recorder.py
@@ -0,0 +1,26 @@
+import logging
+import torch
+
+from collections import defaultdict
+from .abstract_callback import AbstractCallback
+
+
+LOGGER = logging.getLogger(__name__)
+
+
+class HistoryRecorder(AbstractCallback):
+    """ Records all losses and metrics during training. """
+
+    def __init__(self, epoch_steps):
+        self.history = defaultdict(list)
+        self._epoch_steps = epoch_steps
+
+    def __call__(self, epoch, step, performance_measures, context):
+
+        if step % self._epoch_steps == 0:  # skip the step at each epoch boundary
+            return
+
+        for key, value in performance_measures.items():
+            if type(value) == torch.Tensor:
+                value = value.item()
+            self.history[key].append(value)
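All of these training callbacks share one call protocol, so wiring them up is uniform. A minimal sketch of that contract; the context stub below stands in for the trainer context the callbacks expect (it must expose model, optimizer, train_data_loader and _stop_training), and all numbers are made up:

    from sonosco.training.early_stopping import EarlyStopping
    from sonosco.training.history_recorder import HistoryRecorder

    class Ctx:  # stand-in for the trainer context, assumed from the attribute accesses above
        train_data_loader = [None] * 100  # len() gives the number of steps per epoch
        _stop_training = False

    context = Ctx()
    recorder = HistoryRecorder(epoch_steps=100)
    stopper = EarlyStopping(monitor='val_loss', patience=3)

    # the trainer is expected to invoke each callback once per training step
    for callback in (recorder, stopper):
        callback(epoch=0, step=99, performance_measures={'val_loss': 0.42}, context=context)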
diff --git a/sonosco/training/model_checkpoint.py b/sonosco/training/model_checkpoint.py
new file mode 100644
index 0000000..78f8f4a
--- /dev/null
+++ b/sonosco/training/model_checkpoint.py
@@ -0,0 +1,43 @@
+import logging
+import sys
+import os.path as path
+import torch
+
+from .abstract_callback import AbstractCallback
+
+
+LOGGER = logging.getLogger(__name__)
+
+
+class ModelCheckpoint(AbstractCallback):
+    """
+    Saves the model and optimizer state at the point with lowest validation error throughout training.
+
+    Args:
+        output_path (string): path to directory where the checkpoint will be saved to
+        model_name (string): name of the checkpoint file
+    """
+
+    def __init__(self, output_path, model_name='model_checkpoint.pt'):
+        self.output_path = path.join(output_path, model_name)
+        self.best_val_score = sys.float_info.max
+
+    def __call__(self, epoch, step, performance_measures, context):
+
+        if 'val_loss' not in performance_measures:
+            return
+
+        if performance_measures['val_loss'] < self.best_val_score:
+            self.best_val_score = performance_measures['val_loss']
+            self._save_checkpoint(context.model, context.optimizer, epoch)
+
+    def _save_checkpoint(self, model, optimizer, epoch):
+        LOGGER.info("Saving model at checkpoint.")
+        model.eval()
+        model_state_dict = model.state_dict()
+        optimizer_state_dict = optimizer.state_dict()
+        torch.save({'arch': model.__class__.__name__,
+                    'epoch': epoch,
+                    'model_state_dict': model_state_dict,
+                    'optimizer_state_dict': optimizer_state_dict
+                    }, self.output_path)
+        model.train()

From 2ffa5120ef8b29e5936821f1e49d9fcf1633798f Mon Sep 17 00:00:00 2001
From: yuriyarabskyy
Date: Tue, 25 Jun 2019 14:06:58 +0200
Subject: [PATCH 56/58] make add_noise a class member

---
 sonosco/common/audio_tools.py  | 6 ------
 sonosco/common/noise_makers.py | 8 +++++++-
 sonosco/datasets/processor.py  | 7 ++++++-
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/sonosco/common/audio_tools.py b/sonosco/common/audio_tools.py
index 807e79b..fa53021 100644
--- a/sonosco/common/audio_tools.py
+++ b/sonosco/common/audio_tools.py
@@ -21,12 +21,6 @@ def transcode_recordings_ted3(source, destination, start_time, end_time, sample_
     subprocess.call([f"sox {source} -r {sample_rate} -b 16 -c 1 {destination} trim {start_time} ={end_time}"],shell=True)
 
 
-def add_noise(audio, noise_maker: NoiseMaker = None):
-    if noise_maker is None:
-        noise_maker = GaussianNoiseMaker()
-    return noise_maker(audio)
-
-
 def shift(audio, n_samples=1600):
     return np.roll(audio, n_samples)
 
diff --git a/sonosco/common/noise_makers.py b/sonosco/common/noise_makers.py
index 551e2ec..a280e94 100644
--- a/sonosco/common/noise_makers.py
+++ b/sonosco/common/noise_makers.py
@@ -1,12 +1,18 @@
 import numpy as np
+from abc import ABC, abstractmethod
 
 
-class NoiseMaker:
+class NoiseMaker(ABC):
 
+    @abstractmethod
     def __call__(self, audio):
         """Adds noise to the audio signal."""
         pass
 
+    def add_noise(self, audio):
+        return self(audio)
+
 
 class GaussianNoiseMaker(NoiseMaker):
 
diff --git a/sonosco/datasets/processor.py b/sonosco/datasets/processor.py
index 2bbf7a8..d8e91e2 100644
--- a/sonosco/datasets/processor.py
+++ b/sonosco/datasets/processor.py
@@ -4,6 +4,7 @@
 import numpy as np
 import sonosco.common.audio_tools as audio_tools
 import sonosco.common.utils as utils
+import sonosco.common.noise_makers as noise_makers
 
 
 LOGGER = logging.getLogger(__name__)
@@ -53,7 +54,11 @@ def augment_audio(self, sound, stretch=True, shift=False, pitch=True, noise=True
         augmented = audio_tools.shift(augmented, np.random.randint(MAX_SHIFT)) if shift else augmented
         augmented = audio_tools.pitch_shift(augmented, self.sample_rate,
                                             n_steps=utils.random_float(MIN_PITCH, MAX_PITCH)) if pitch else augmented
-        augmented = audio_tools.add_noise(augmented) if noise else augmented
+
+        if noise:
+            noise_maker = noise_makers.GaussianNoiseMaker()
+            augmented = noise_maker.add_noise(augmented)
+
         return augmented
 
     def parse_audio(self, audio_path, raw=False):
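The upshot of this patch: noise injection now goes through the NoiseMaker hierarchy instead of a free function. A short sketch of the resulting call path, assuming GaussianNoiseMaker's default constructor and a synthetic signal:

    import numpy as np
    from sonosco.common.noise_makers import GaussianNoiseMaker

    audio = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz
    maker = GaussianNoiseMaker()               # concrete NoiseMaker implementing __call__
    noisy = maker.add_noise(audio)             # base-class helper delegating to __call__(audio)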
From 70b6bfb123e1582c7bf57a5cfcf2033bdfe9ff2c Mon Sep 17 00:00:00 2001
From: yuriyarabskyy
Date: Tue, 25 Jun 2019 14:18:26 +0200
Subject: [PATCH 57/58] restructure model loader

---
 sonosco/datasets/loader.py             |  28 +--
 sonosco/loader.py                      | 250 ------------------------
 sonosco/models/deepspeech2.py          |  99 +---------
 sonosco/models/deepspeech2_sonosco.py  |  11 +-
 sonosco/{model.py => models/loader.py} |  53 +----
 sonosco/models/modules.py              | 101 ++++++++++
 sonosco/models/saver.py                |  56 ++++++
 sonosco/{ => models}/serialization.py  |   0
 8 files changed, 178 insertions(+), 420 deletions(-)
 delete mode 100644 sonosco/loader.py
 rename sonosco/{model.py => models/loader.py} (66%)
 create mode 100644 sonosco/models/modules.py
 create mode 100644 sonosco/models/saver.py
 rename sonosco/{ => models}/serialization.py (100%)

diff --git a/sonosco/datasets/loader.py b/sonosco/datasets/loader.py
index 92c3951..e23c2fd 100644
--- a/sonosco/datasets/loader.py
+++ b/sonosco/datasets/loader.py
@@ -1,8 +1,8 @@
-import numpy as np
 import logging
 import torch
+import torch.nn
 
-from torch.utils.data import Dataset, DataLoader, Sampler
+from torch.utils.data import DataLoader
 
 from .dataset import AudioDataProcessor, AudioDataset
 from .samplers import BucketingSampler
@@ -20,23 +20,23 @@ def __init__(self, *args, **kwargs):
         self.collate_fn = self._collate_fn
 
     def _collate_fn(self, batch):
-        #sort the batch in decreasing order of sequence length
+        # sort the batch in decreasing order of sequence length
         batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True)
 
-        #pad the tensors to have equal lengths, therefore transpose the tensors in
-        #the batch. The tensors have shape: freq_size x sequence_length
-        #and need to be of shape: sequence_length x freq_length, as sequence length differs
-        #but not the freq_length
+        # pad the tensors to have equal lengths, therefore transpose the tensors in
+        # the batch. The tensors have shape: freq_size x sequence_length
+        # and need to be of shape: sequence_length x freq_size, as the sequence length differs
+        # but not the freq_size
         inputs = torch.nn.utils.rnn.pad_sequence(list(map(lambda x: x[0].transpose(0,1), batch)), batch_first=True)
 
-        #inputs need to be transposed back from shape batch_size x sequence_length x freq_length
-        #to batch_size x freq_length x sequence_length. Additionally, unsqueeze tensor
-        inputs = inputs.transpose(1,2).unsqueeze(1)
-        input_lengths = torch.IntTensor(list(map(lambda x: x[0].size(1), batch))) #create tensor of input lengths
+        # inputs need to be transposed back from shape batch_size x sequence_length x freq_size
+        # to batch_size x freq_size x sequence_length.
Additionally, unsqueeze tensor + inputs = inputs.transpose(1, 2).unsqueeze(1) + input_lengths = torch.IntTensor(list(map(lambda x: x[0].size(1), batch))) # create tensor of input lengths - targets_arr = list(zip(*batch))[1] #extract targets array from batch ( batch is array of tuples) - target_lengths = torch.IntTensor(list(map(lambda x: len(x),targets_arr))) #create tensor of target lengths - targets = torch.cat(list(map(lambda x: torch.IntTensor(x), targets_arr))) #create tensor of targets + targets_arr = list(zip(*batch))[1] # extract targets array from batch ( batch is array of tuples) + target_lengths = torch.IntTensor(list(map(lambda x: len(x), targets_arr))) # create tensor of target lengths + targets = torch.cat(list(map(lambda x: torch.IntTensor(x), targets_arr))) # create tensor of targets return inputs, targets, input_lengths, target_lengths diff --git a/sonosco/loader.py b/sonosco/loader.py deleted file mode 100644 index b73b065..0000000 --- a/sonosco/loader.py +++ /dev/null @@ -1,250 +0,0 @@ -# ---------------------------------------------------------------------------- -# Based on SeanNaren's deepspeech.pytorch: -# https://github.com/SeanNaren/deepspeech.pytorch -# ---------------------------------------------------------------------------- - -import math -import warnings -from typing import Tuple - -import librosa -import numpy as np -import torch -import torchaudio -from scipy import signal -from torch.utils.data import Dataset, DataLoader, Sampler - -# FIXME: Deprecated functions usage -from torch.distributed.deprecated import get_rank -from torch.distributed.deprecated import get_world_size - -windows = {"bartlett": torch.bartlett_window, - "blackman": torch.blackman_window, - "hamming": torch.hamming_window, - "hann": torch.hann_window} - -windows_legacy = {'hamming': signal.hamming, - 'hann': signal.hann, - 'blackman': signal.blackman, - 'bartlett': signal.bartlett} - - -class DataProcessor(object): - def __init__(self, audio_conf, labels="abc", normalize=False, augment=False, legacy=True): - """ - Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by - a comma. Each new line is a different sample. Example below: - - /path/to/audio.wav,/path/to/audio.txt - ... 
- - :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds - :param labels: String containing all the possible characters to map to - :param normalize: Apply standard mean and deviation normalization to audio tensor - :param augment(default False): Apply random tempo and gain perturbations - """ - self.labels_map = dict([(labels[i], i) for i in range(len(labels))]) - self.window_stride = audio_conf["window_stride"] - self.window_size = audio_conf["window_size"] - self.sample_rate = audio_conf["sample_rate"] - self.window = windows_legacy.get(audio_conf["window"], windows_legacy["hamming"]) if legacy else windows.get(audio_conf["window"], windows["hamming"]) - self.normalize = normalize - self.augment = augment - self.legacy = legacy - self.transform = torchaudio.transforms.Spectrogram(n_fft=int(self.sample_rate * self.window_size), - hop=int(self.sample_rate * self.window_stride), - window=self.window, normalize=self.normalize) - - @staticmethod - def retrieve_file(audio_path, legacy=True): - sound, sample_rate = torchaudio.load(audio_path) - if legacy: - sound = sound.numpy().T - if len(sound.shape) > 1: - if sound.shape[1] == 1: - sound = sound.squeeze() - else: - sound = sound.mean(axis=1) - return sound, sample_rate - - @staticmethod - def augment_audio(sound, tempo_range: Tuple = (0.85, 1.15), gain_range: Tuple = (-6, 8)): - """ - Changes tempo and gain of the wave - """ - warnings.warn("Augmentation is not implemented") # TODO: Implement - return sound - - def parse_audio(self, audio_path): - sound, sample_rate = self.retrieve_file(audio_path, self.legacy) - if sample_rate != self.sample_rate: - raise ValueError(f"The stated sample rate {self.sample_rate} and the factual rate {sample_rate} differ!") - - if self.augment: - sound = self.augment_audio(sound) - - if self.legacy: - n_fft = int(self.sample_rate * self.window_size) - win_length = n_fft - hop_length = int(self.sample_rate * self.window_stride) - # STFT - D = librosa.stft(sound, n_fft=n_fft, hop_length=hop_length, - win_length=win_length, window=self.window) - spectrogram, phase = librosa.magphase(D) - # S = log(S+1) - - spectrogram = torch.FloatTensor(np.log1p(spectrogram)) - else: - # TODO: Why these are different from librosa.stft? - sound = sound.cuda() - spectrogram = self.transform(sound)[-1, :, :].transpose(0, 1) - - # spectrogram = torch.stft(torch.from_numpy(sound.numpy().T.squeeze()), - # n_fft=int(self.sample_rate * self.window_size), - # hop_length=int(self.sample_rate * self.window_stride), - # win_length=int(self.sample_rate * self.window_size), - # window=torch.hamming_window(int(self.sample_rate * self.window_size)))[:, :, -1] - - if self.normalize: - mean = spectrogram.mean() - std = spectrogram.std() - spectrogram.add_(-mean) - spectrogram.div_(std) - - return spectrogram - - def parse_transcript(self, transcript_path): - with open(transcript_path, 'r', encoding='utf8') as transcript_file: - transcript = transcript_file.read().replace('\n', '') - # TODO: Is it fast enough? - transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)])) - return transcript - - -class AudioDataset(Dataset): - def __init__(self, audio_conf, manifest_filepath, labels, normalize=False, augment=False, legacy=True): - """ - Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by - a comma. Each new line is a different sample. Example below: - - /path/to/audio.wav,/path/to/audio.txt - ... 
- - :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds - :param manifest_filepath: Path to manifest csv as describe above - :param labels: String containing all the possible characters to map to - :param normalize: Apply standard mean and deviation normalization to audio tensor - :param augment(default False): Apply random tempo and gain perturbations - """ - super(AudioDataset, self).__init__() - with open(manifest_filepath) as f: - ids = f.readlines() - ids = [x.strip().split(',') for x in ids] - self.ids = ids - self.size = len(ids) - self.processor = DataProcessor(audio_conf, labels, normalize, augment, legacy) - - def __getitem__(self, index): - sample = self.ids[index] - audio_path, transcript_path = sample[0], sample[1] - - spectrogram = self.processor.parse_audio(audio_path) - transcript = self.processor.parse_transcript(transcript_path) - - return spectrogram, transcript - - def __len__(self): - return self.size - - -# TODO: Optimise -def _collate_fn(batch): - batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True) - longest_sample = batch[0][0] - freq_size, max_seqlength = longest_sample.size() - minibatch_size = len(batch) - inputs = torch.zeros(minibatch_size, 1, freq_size, max_seqlength) - input_percentages = torch.FloatTensor(minibatch_size) - target_sizes = np.zeros(minibatch_size, dtype=np.int32) - - # TODO: Numpy broadcasting magic - targets = [] - - for x in range(minibatch_size): - inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) - input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) - target_sizes[x] = len(batch[x][1]) - targets.extend(batch[x][1]) - - return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) - - -class AudioDataLoader(DataLoader): - def __init__(self, *args, **kwargs): - """ - Creates a data loader for AudioDatasets. - """ - super(AudioDataLoader, self).__init__(*args, **kwargs) - self.collate_fn = _collate_fn - - -class BucketingSampler(Sampler): - def __init__(self, data_source, batch_size=1): - """ - Samples batches assuming they are in order of size to batch similarly sized samples together. - """ - super(BucketingSampler, self).__init__(data_source) - self.data_source = data_source - ids = list(range(0, len(data_source))) - # TODO: Optimise - self.bins = [ids[i:i + batch_size] for i in range(0, len(ids), batch_size)] - - def __iter__(self): - for ids in self.bins: - np.random.shuffle(ids) - yield ids - - def __len__(self): - return len(self.bins) - - def shuffle(self, epoch): - np.random.shuffle(self.bins) - - -# TODO: Optimise -class DistributedBucketingSampler(Sampler): - def __init__(self, data_source, batch_size=1, num_replicas=None, rank=None): - """ - Samples batches assuming they are in order of size to batch similarly sized samples together. 
- """ - super(DistributedBucketingSampler, self).__init__(data_source) - if num_replicas is None: - num_replicas = get_world_size() - if rank is None: - rank = get_rank() - self.data_source = data_source - self.ids = list(range(0, len(data_source))) - self.batch_size = batch_size - self.bins = [self.ids[i:i + batch_size] for i in range(0, len(self.ids), batch_size)] - self.num_replicas = num_replicas - self.rank = rank - self.num_samples = int(math.ceil(len(self.bins) * 1.0 / self.num_replicas)) - self.total_size = self.num_samples * self.num_replicas - - def __iter__(self): - offset = self.rank - # add extra samples to make it evenly divisible - bins = self.bins + self.bins[:(self.total_size - len(self.bins))] - assert len(bins) == self.total_size - samples = bins[offset::self.num_replicas] # Get every Nth bin, starting from rank - return iter(samples) - - def __len__(self): - return self.num_samples - - def shuffle(self, epoch): - # deterministically shuffle based on epoch - g = torch.Generator() - g.manual_seed(epoch) - bin_ids = list(torch.randperm(len(self.bins), generator=g)) - self.bins = [self.bins[i] for i in bin_ids] diff --git a/sonosco/models/deepspeech2.py b/sonosco/models/deepspeech2.py index 35645d2..0fff74f 100644 --- a/sonosco/models/deepspeech2.py +++ b/sonosco/models/deepspeech2.py @@ -2,111 +2,16 @@ # Based on SeanNaren's deepspeech.pytorch: # https://github.com/SeanNaren/deepspeech.pytorch # ---------------------------------------------------------------------------- - import math -from collections import OrderedDict - import torch import logging import torch.nn as nn -import torch.nn.functional as F +from collections import OrderedDict +from .modules import MaskConv, BatchRNN, SequenceWise, InferenceBatchSoftmax, supported_rnns, supported_rnns_inv LOGGER = logging.getLogger(__name__) -supported_rnns = { - 'lstm': nn.LSTM, - 'rnn': nn.RNN, - 'gru': nn.GRU -} -supported_rnns_inv = dict((v, k) for k, v in supported_rnns.items()) - - -class SequenceWise(nn.Module): - def __init__(self, module): - """ - Collapses input of dim T*N*H to (T*N)*H, and applies to a module. - Allows handling of variable sequence lengths and minibatch sizes. - :param module: Module to apply input to. - """ - super(SequenceWise, self).__init__() - self.module = module - - def forward(self, x): - t, n = x.size(0), x.size(1) - x = x.view(t * n, -1) - x = self.module(x) - x = x.view(t, n, -1) - return x - - def __repr__(self): - tmpstr = self.__class__.__name__ + ' (\n' - tmpstr += self.module.__repr__() - tmpstr += ')' - return tmpstr - - -class MaskConv(nn.Module): - def __init__(self, seq_module): - """ - Adds padding to the output of the module based on the given lengths. This is to ensure that the - results of the model do not change when batch sizes change during inference. - Input needs to be in the shape of (BxCxDxT) - :param seq_module: The sequential module containing the conv stack. 
- """ - super(MaskConv, self).__init__() - self.seq_module = seq_module - - def forward(self, x, lengths): - """ - :param x: The input of size BxCxDxT - :param lengths: The actual length of each sequence in the batch - :return: Masked output from the module - """ - for module in self.seq_module: - x = module(x) - mask = torch.ByteTensor(x.size()).fill_(0) - if x.is_cuda: - mask = mask.cuda() - for i, length in enumerate(lengths): - length = length.item() - if (mask[i].size(2) - length) > 0: - mask[i].narrow(2, length, mask[i].size(2) - length).fill_(1) - x = x.masked_fill(mask, 0) - return x, lengths - - -class InferenceBatchSoftmax(nn.Module): - def forward(self, input_): - if not self.training: - return F.softmax(input_, dim=-1) - else: - return input_ - - -class BatchRNN(nn.Module): - def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, batch_norm=True, bidirectional=False): - super(BatchRNN, self).__init__() - self.bidirectional = bidirectional - self.input_size = input_size - self.hidden_size = hidden_size - self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None - self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size, - bidirectional=bidirectional, bias=True) - - def flatten_parameters(self): - self.rnn.flatten_parameters() - - def forward(self, x, output_lengths): - if self.batch_norm is not None: - x = self.batch_norm(x) - x = nn.utils.rnn.pack_padded_sequence(x, output_lengths) - x, h = self.rnn(x) - x, _ = nn.utils.rnn.pad_packed_sequence(x) - if self.bidirectional: - x = x.view(x.size(0), x.size(1), 2, -1).sum(2).view(x.size(0), x.size(1), -1) # (TxNxH*2) -> (TxNxH) by sum - return x - class DeepSpeech2(nn.Module): def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hid_size=768, nb_layers=5, audio_conf=None, diff --git a/sonosco/models/deepspeech2_sonosco.py b/sonosco/models/deepspeech2_sonosco.py index 1e1b1c3..c95e7eb 100644 --- a/sonosco/models/deepspeech2_sonosco.py +++ b/sonosco/models/deepspeech2_sonosco.py @@ -1,13 +1,10 @@ import math -from collections import OrderedDict -from dataclasses import field -import torch from torch import nn - -from models.deepspeech2 import MaskConv, BatchRNN, SequenceWise, InferenceBatchSoftmax, supported_rnns, \ - supported_rnns_inv -from serialization import serializable +from .modules import MaskConv, BatchRNN, SequenceWise, InferenceBatchSoftmax, supported_rnns, supported_rnns_inv +from .serialization import serializable +from collections import OrderedDict +from dataclasses import field @serializable diff --git a/sonosco/model.py b/sonosco/models/loader.py similarity index 66% rename from sonosco/model.py rename to sonosco/models/loader.py index c943244..66e0b11 100644 --- a/sonosco/model.py +++ b/sonosco/models/loader.py @@ -1,64 +1,13 @@ import logging - import torch import deprecation -import inspect import torch.nn as nn -from common.class_utils import get_constructor_args, get_class_by_name -from serialization import is_serializable +from sonosco.common.class_utils import get_constructor_args, get_class_by_name LOGGER = logging.getLogger(__name__) -class Saver: - - def __init__(self) -> None: - super().__init__() - - @deprecation.deprecated( - details="This type of saving may cause problems when path of model class changes. Pleas use save_model instead") - def save_model_simple(self, model: nn.Module, path: str) -> None: - """ - Simply saves the model using pickle protocol. 
- Args: - model (nn.Module): model to save - path (str) : path where to save the model - - Returns: - - """ - torch.save(model, path) - - def save_model(self, model: nn.Module, path: str) -> None: - """ - Saves the model using pickle protocol. - - If the infer_structure is True this method infers all the meta parameters of the model and save them together - with learnable parameters. - - If the infer_structure is False and method specified by serialize_method_name exists, the return value of the - serialize_method_name method is saved. - - If neither of above only learnable parameters a.k.a. state_dict are saved. - - Args: - model (nn.Module): model to save - path (str) : path where to save the model - infer_structure (bool): indicator whether to infer the model structure - serialize_method_name (str): name of the function that this method should call in order to serialize the - model. Must return dict. - - Returns: - - """ - if is_serializable(model): - entity_to_save = model.__serialize__() - torch.save(entity_to_save, path) - else: - raise TypeError("Only @serializable class can be serialized") - - class Loader: @deprecation.deprecated( diff --git a/sonosco/models/modules.py b/sonosco/models/modules.py new file mode 100644 index 0000000..015dc5a --- /dev/null +++ b/sonosco/models/modules.py @@ -0,0 +1,101 @@ +import torch +import logging +import torch.nn as nn +import torch.nn.functional as functional + + +LOGGER = logging.getLogger(__name__) + +supported_rnns = { + 'lstm': nn.LSTM, + 'rnn': nn.RNN, + 'gru': nn.GRU +} + +supported_rnns_inv = dict((v, k) for k, v in supported_rnns.items()) + + +class SequenceWise(nn.Module): + def __init__(self, module): + """ + Collapses input of dim T*N*H to (T*N)*H, and applies to a module. + Allows handling of variable sequence lengths and minibatch sizes. + :param module: Module to apply input to. + """ + super(SequenceWise, self).__init__() + self.module = module + + def forward(self, x): + t, n = x.size(0), x.size(1) + x = x.view(t * n, -1) + x = self.module(x) + x = x.view(t, n, -1) + return x + + def __repr__(self): + tmpstr = self.__class__.__name__ + ' (\n' + tmpstr += self.module.__repr__() + tmpstr += ')' + return tmpstr + + +class MaskConv(nn.Module): + def __init__(self, seq_module): + """ + Adds padding to the output of the module based on the given lengths. This is to ensure that the + results of the model do not change when batch sizes change during inference. + Input needs to be in the shape of (BxCxDxT) + :param seq_module: The sequential module containing the conv stack. 
+ """ + super(MaskConv, self).__init__() + self.seq_module = seq_module + + def forward(self, x, lengths): + """ + :param x: The input of size BxCxDxT + :param lengths: The actual length of each sequence in the batch + :return: Masked output from the module + """ + for module in self.seq_module: + x = module(x) + mask = torch.ByteTensor(x.size()).fill_(0) + if x.is_cuda: + mask = mask.cuda() + for i, length in enumerate(lengths): + length = length.item() + if (mask[i].size(2) - length) > 0: + mask[i].narrow(2, length, mask[i].size(2) - length).fill_(1) + x = x.masked_fill(mask, 0) + return x, lengths + + +class InferenceBatchSoftmax(nn.Module): + def forward(self, input_): + if not self.training: + return functional.softmax(input_, dim=-1) + else: + return input_ + + +class BatchRNN(nn.Module): + def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, batch_norm=True, bidirectional=False): + super(BatchRNN, self).__init__() + self.bidirectional = bidirectional + self.input_size = input_size + self.hidden_size = hidden_size + self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None + self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size, + bidirectional=bidirectional, bias=True) + + def flatten_parameters(self): + self.rnn.flatten_parameters() + + def forward(self, x, output_lengths): + if self.batch_norm is not None: + x = self.batch_norm(x) + x = nn.utils.rnn.pack_padded_sequence(x, output_lengths) + x, h = self.rnn(x) + x, _ = nn.utils.rnn.pad_packed_sequence(x) + if self.bidirectional: + x = x.view(x.size(0), x.size(1), 2, -1).sum(2).view(x.size(0), x.size(1), -1) # (TxNxH*2) -> (TxNxH) by sum + return x diff --git a/sonosco/models/saver.py b/sonosco/models/saver.py new file mode 100644 index 0000000..4198a2c --- /dev/null +++ b/sonosco/models/saver.py @@ -0,0 +1,56 @@ +import logging +import torch +import deprecation +import torch.nn as nn + +from .serialization import is_serializable + +LOGGER = logging.getLogger(__name__) + + +class Saver: + + def __init__(self) -> None: + super().__init__() + + @deprecation.deprecated( + details="This type of saving may cause problems when path of model class changes. Pleas use save_model instead") + def save_model_simple(self, model: nn.Module, path: str) -> None: + """ + Simply saves the model using pickle protocol. + Args: + model (nn.Module): model to save + path (str) : path where to save the model + + Returns: + + """ + torch.save(model, path) + + def save_model(self, model: nn.Module, path: str) -> None: + """ + Saves the model using pickle protocol. + + If the infer_structure is True this method infers all the meta parameters of the model and save them together + with learnable parameters. + + If the infer_structure is False and method specified by serialize_method_name exists, the return value of the + serialize_method_name method is saved. + + If neither of above only learnable parameters a.k.a. state_dict are saved. + + Args: + model (nn.Module): model to save + path (str) : path where to save the model + infer_structure (bool): indicator whether to infer the model structure + serialize_method_name (str): name of the function that this method should call in order to serialize the + model. Must return dict. 
diff --git a/sonosco/models/saver.py b/sonosco/models/saver.py
new file mode 100644
index 0000000..4198a2c
--- /dev/null
+++ b/sonosco/models/saver.py
@@ -0,0 +1,56 @@
+import logging
+import torch
+import deprecation
+import torch.nn as nn
+
+from .serialization import is_serializable
+
+LOGGER = logging.getLogger(__name__)
+
+
+class Saver:
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    @deprecation.deprecated(
+        details="This type of saving may cause problems when the path of the model class changes. Please use save_model instead")
+    def save_model_simple(self, model: nn.Module, path: str) -> None:
+        """
+        Simply saves the model using pickle protocol.
+        Args:
+            model (nn.Module): model to save
+            path (str): path where to save the model
+
+        Returns:
+
+        """
+        torch.save(model, path)
+
+    def save_model(self, model: nn.Module, path: str) -> None:
+        """
+        Saves the model using pickle protocol.
+
+        The model class must be decorated with @serializable. Its __serialize__ method is called
+        to collect the meta parameters together with the learnable parameters, and the resulting
+        dict is saved to the given path. A TypeError is raised for models that are not
+        @serializable.
+
+        Args:
+            model (nn.Module): model to save
+            path (str): path where to save the model
+
+        Returns:
+
+        """
+        if is_serializable(model):
+            entity_to_save = model.__serialize__()
+            torch.save(entity_to_save, path)
+        else:
+            raise TypeError("Only @serializable class can be serialized")

diff --git a/sonosco/serialization.py b/sonosco/models/serialization.py
similarity index 100%
rename from sonosco/serialization.py
rename to sonosco/models/serialization.py

From eee49f6b53fb785c0ec189eb56289b9e2c71356d Mon Sep 17 00:00:00 2001
From: yuriyarabskyy
Date: Tue, 25 Jun 2019 14:20:19 +0200
Subject: [PATCH 58/58] use f strings

---
 sonosco/models/serialization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sonosco/models/serialization.py b/sonosco/models/serialization.py
index 9f905b6..4f824af 100644
--- a/sonosco/models/serialization.py
+++ b/sonosco/models/serialization.py
@@ -49,7 +49,7 @@ def __create_serialize_body(cls, fields_to_serialize):
         elif is_dataclass(field.type):
             body_lines.append(__create_dict_entry(field.name, f"self.{field.name}.__serialize__()"))
         elif __is_nn_class(field.type):
-            body_lines.append("'{}': {".format(field.name))
+            body_lines.append(f"'{field.name}': {{")
             __extract_from_nn(cls, body_lines)
             body_lines.append("}")
         else:
@@ -75,7 +75,7 @@ def __throw_unsupported_data_type():
 
 
 def __create_dict_entry(key, value):
-    return f'\'{key}\': {value},'
+    return f"'{key}': {value},"
 
 
 def __is_primitive(obj):