From 7bf80fb96819e3c51eb3a5087a428dfa1356b830 Mon Sep 17 00:00:00 2001
From: "w.jurasz"
Date: Sat, 25 May 2019 12:17:40 +0200
Subject: [PATCH 01/58] Added gitignore and try catch on apex

---
 .gitignore      | 122 ++++++++++++++++++++++++++++++++++++++++++++++++
 modelwrapper.py |  15 +++---
 2 files changed, 131 insertions(+), 6 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7a0e43a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,122 @@
+# Created by .ignore support plugin (hsz.mobi)
+.idea/
+
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+### VirtualEnv template
+# Virtualenv
+# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
+.Python
+[Bb]in
+[Ii]nclude
+[Ll]ib
+[Ll]ib64
+[Ll]ocal
+[Ss]cripts
+pyvenv.cfg
+.venv
+pip-selfcheck.json
+
diff --git a/modelwrapper.py b/modelwrapper.py
index 3a0ec81..b313603 100644
--- a/modelwrapper.py
+++ b/modelwrapper.py
@@ -14,8 +14,13 @@
 import torch.distributed as dist
 import torch.utils.data.distributed
-from apex.fp16_utils import FP16_Optimizer
-from apex.parallel import DistributedDataParallel
+
+try:
+    from apex.fp16_utils import FP16_Optimizer
+    from apex.parallel import DistributedDataParallel
+except Exception as e:
+    print(f"Apex import failed: {e}")
+
 
 from tqdm import tqdm
 from warpctc_pytorch import CTCLoss
 
@@ -320,7 +325,7 @@ def train(self, seed, cuda, mixed_precision, world_size, gpu_rank, rank, save_fo
         train_sampler.shuffle(epoch)
 
     def validate(self):
-
+        pass
 
     def test(self):
         torch.set_grad_enabled(False)
@@ -386,8 +391,7 @@ def test(self):
         if save_output:
             np.save(output_path, output_data)
 
-
-def infer(self, sound):
+    def infer(self, sound):
         pass
 
     @staticmethod
@@ -401,7 +405,6 @@ def get_default_path(def_path: str) -> str:
         default = latest_subdir + "/final.pth"
         return default
 
-
     def print_training_info(self, epoch, loss, cer, wer):
         print(f"\nTraining Information\n " + \
               f"- Epoch:\t{epoch}\n " + \

From 855e5162ffcf9984db6c96e3da12a982223a56c8 Mon Sep 17 00:00:00 2001
From: yuriyarabskyy
Date: Sat, 25 May 2019 12:28:20 +0200
Subject: [PATCH 02/58] add requirements and move code to a subfolder

---
 STT_srv.py           | 43 ----
 config/infer.yaml    | 21 --
 config/test.yaml     |  9 -
 config/train.yaml    | 49 -----
 decoders/__init__.py |  0
decoders/beam_decoder.py | 75 ------- decoders/decoder.py | 85 -------- decoders/greedy_decoder.py | 74 ------- infer.py | 24 --- loader.py | 248 ---------------------- models/__init__.py | 0 models/deepspeech2.py | 269 ------------------------ modelwrapper.py | 413 ------------------------------------- test.py | 18 -- train.py | 18 -- utils.py | 51 ----- 16 files changed, 1397 deletions(-) delete mode 100644 STT_srv.py delete mode 100644 config/infer.yaml delete mode 100644 config/test.yaml delete mode 100644 config/train.yaml delete mode 100644 decoders/__init__.py delete mode 100644 decoders/beam_decoder.py delete mode 100644 decoders/decoder.py delete mode 100644 decoders/greedy_decoder.py delete mode 100644 infer.py delete mode 100644 loader.py delete mode 100644 models/__init__.py delete mode 100644 models/deepspeech2.py delete mode 100644 modelwrapper.py delete mode 100644 test.py delete mode 100644 train.py delete mode 100644 utils.py diff --git a/STT_srv.py b/STT_srv.py deleted file mode 100644 index 7013756..0000000 --- a/STT_srv.py +++ /dev/null @@ -1,43 +0,0 @@ -import os - -from roboy_cognition_msgs.msg import RecognizedSpeech -from roboy_cognition_msgs.srv import RecognizeSpeech - -from asr_interface import IAsr -import rclpy -from rclpy.node import Node - - -class SonoscoROS2(Node): - def __init__(self): - super().__init__('stt') - self.publisher = self.create_publisher(RecognizedSpeech, '/roboy/cognition/speech/recognition') - self.srv = self.create_service(RecognizeSpeech, '/roboy/cognition/speech/recognition/recognize', self.asr_callback) - print("Ready to /roboy/cognition/speech/recognition/recognize") - print(f"Roboy Sonosco running with PID: {os.getpid()}") - self.i=IAsr() - print(f"Status: Speech recognition is ready now!") - print("Roboy Sonosco is ready!") - - def asr_callback(self, request, response): - response.success = True - self.get_logger().info('Incoming Audio') - msg = RecognizedSpeech() - self.i.inference_audio(request) - self.publisher.publish(msg) - return response - - -def main(args=None): - rclpy.init(args=args) - - stt = SonoscoROS2() - - while rclpy.ok(): - rclpy.spin_once(stt) - - rclpy.shutdown() - - -if __name__ == '__main__': - main() diff --git a/config/infer.yaml b/config/infer.yaml deleted file mode 100644 index be61617..0000000 --- a/config/infer.yaml +++ /dev/null @@ -1,21 +0,0 @@ -infer: - model_name: "" - audio_path: "" # Audio file to predict on - - sample_rate: 16000 # Sample rate - window_size: 0.02 # Window size for spectrogram in seconds - window_stride: 0.01 # Window stride for spectrogram in seconds - window: 'hamming' # Window type for spectrogram generation - - beam_decoder: False # Turn on beam decoder. 
otherwise - greedy
-  alpha: 0.8
-  beam_width: 10
-  beta: 1
-  cutoff_prob: 1.0
-  cutoff_top_n: 40
-  lm_path: None                # Path to a KenLM binary
-  lm_workers: 1
-  offsets: False               # Returns time offset information
-  top_paths: 1
-
-  cuda: True                   # Use cuda to run model
\ No newline at end of file
diff --git a/config/test.yaml b/config/test.yaml
deleted file mode 100644
index 2589e15..0000000
--- a/config/test.yaml
+++ /dev/null
@@ -1,9 +0,0 @@
-test:
-  test_manifest: ""            # Path to test manifest csv
-
-  batch_size: 32               # Batch size for testing
-  num_workers: 4               # Number of workers used in loading
-  verbose: True                # Print out decoded output and error of each sample
-  save_output: True            # Saves output of model from test
-  output_path: ""              # Where to save raw acoustic output
-
diff --git a/config/train.yaml b/config/train.yaml
deleted file mode 100644
index 25be72c..0000000
--- a/config/train.yaml
+++ /dev/null
@@ -1,49 +0,0 @@
-train:
-  train_manifest: 'examples/manifests/train_manifest.csv'
-  val_manifest: 'examples/manifests/val_manifest.csv'
-  labels_path: 'examples/labels.json'   # Contains all characters for transcription
-  log_dir: 'logs'                       # Location for log files
-  def_dir: 'examples/checkpoints/'      # Default location to save/load models
-
-  load_from: 'asr_final.pth'            # File name containing a checkpoint to continue/finetune
-
-  sample_rate: 16000                    # Sample rate
-  window_size: 0.02                     # Window size for spectrogram in seconds
-  window_stride: 0.01                   # Window stride for spectrogram in seconds
-  window: 'hamming'                     # Window type for spectrogram generation
-
-  batch_size: 32                        # Batch size for training
-  hidden_size: 800                      # Hidden size of RNNs
-  hidden_layers: 5                      # Number of RNN layers
-  rnn_type: 'gru'                       # Type of the RNN unit: gru|lstm|rnn are supported
-
-  max_epochs: 70                        # Number of training epochs
-  learning_rate: 3e-4                   # Initial learning rate
-  momentum: 0.9                         # Momentum
-  max_norm: 800                         # Norm cutoff to prevent explosion of gradients
-  learning_anneal: 1.1                  # Annealing applied to learning rate every epoch
-  sortaGrad: True                       # Turn on ordering of dataset on sequence length for the first epoch
-
-  checkpoint: True                      # Enables checkpoint saving of model
-  checkpoint_per_epoch: 1               # Save checkpoint per x epochs
-  silent: False                         # Turn off progress tracking per iteration
-  verbose: False                        # Turn on verbose progress tracking
-  continue: False                       # Continue training with a pre-trained model
-  finetune: False                       # Finetune a pre-trained model
-
-  num_data_workers: 8                   # Number of workers used in data-loading
-  augment: False                        # Use random tempo and gain perturbations
-  shuffle: True                         # Turn on shuffling and sample from dataset based on sequence length (smallest to largest)
-
-  seed: 123456                          # Seed to generators
-  cuda: True                            # Use cuda to train model
-  half_precision: True                  # Uses half precision to train a model
-  apex: True                            # Uses mixed precision to train a model
-  static_loss_scaling: False            # Static loss scale for mixed precision
-  dynamic_loss_scaling: True            # Use dynamic loss scaling for mixed precision
-
-  dist_url: 'tcp://127.0.0.1:1550'      # URL used to set up distributed training
-  dist_backend: 'nccl'                  # Distributed backend
-  world_size: 1                         # Number of distributed processes
-  rank: 0                               # The rank of the current process
-  gpu_rank: 0                           # If using distributed parallel for multi_gpu, sets the GPU for the process
\ No newline at end of file
diff --git a/decoders/__init__.py b/decoders/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/decoders/beam_decoder.py b/decoders/beam_decoder.py
deleted file mode 100644
index c44d164..0000000
--- a/decoders/beam_decoder.py
+++ /dev/null
@@ -1,75 +0,0 @@
-#!/usr/bin/env python
-# ----------------------------------------------------------------------------
-# Copyright 2015-2016 Nervana Systems Inc.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ----------------------------------------------------------------------------
-# Modified to support pytorch Tensors
-import torch
-
-from decoders.decoder import Decoder
-
-
-class BeamCTCDecoder(Decoder):
-    def __init__(self, labels, lm_path=None, alpha=0, beta=0, cutoff_top_n=40, cutoff_prob=1.0, beam_width=100,
-                 num_processes=4, blank_index=0):
-        super(BeamCTCDecoder, self).__init__(labels)
-        try:
-            from ctcdecode import CTCBeamDecoder
-        except ImportError:
-            raise ImportError("BeamCTCDecoder requires the ctcdecode package.")
-        self._decoder = CTCBeamDecoder(labels, lm_path, alpha, beta, cutoff_top_n, cutoff_prob, beam_width,
-                                       num_processes, blank_index)
-
-    def convert_to_strings(self, out, seq_len):
-        results = []
-        for b, batch in enumerate(out):
-            utterances = []
-            for p, utt in enumerate(batch):
-                size = seq_len[b][p]
-                if size > 0:
-                    transcript = ''.join(map(lambda x: self.int_to_char[x.item()], utt[0:size]))
-                else:
-                    transcript = ''
-                utterances.append(transcript)
-            results.append(utterances)
-        return results
-
-    def convert_tensor(self, offsets, sizes):
-        results = []
-        for b, batch in enumerate(offsets):
-            utterances = []
-            for p, utt in enumerate(batch):
-                size = sizes[b][p]
-                if sizes[b][p] > 0:
-                    utterances.append(utt[0:size])
-                else:
-                    utterances.append(torch.tensor([], dtype=torch.int))
-            results.append(utterances)
-        return results
-
-    def decode(self, probs, sizes=None):
-        """
-        Decodes probability output using ctcdecode package.
-        Arguments:
-            probs: Tensor of character probabilities, where probs[c,t]
-                   is the probability of character c at time t
-            sizes: Size of each sequence in the mini-batch
-        Returns:
-            string: sequences of the model's best guess for the transcription
-        """
-        probs = probs.cpu()
-        out, scores, offsets, seq_lens = self._decoder.decode(probs, sizes)
-
-        strings = self.convert_to_strings(out, seq_lens)
-        offsets = self.convert_tensor(offsets, seq_lens)
-        return strings, offsets
\ No newline at end of file
diff --git a/decoders/decoder.py b/decoders/decoder.py
deleted file mode 100644
index 99a8193..0000000
--- a/decoders/decoder.py
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/usr/bin/env python
-# ----------------------------------------------------------------------------
-# Copyright 2015-2016 Nervana Systems Inc.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. -# ---------------------------------------------------------------------------- -# Modified to support pytorch Tensors - -import Levenshtein as Lev - - -class Decoder(object): - """ - Basic decoder class from which all other decoders inherit. Implements several - helper functions. Subclasses should implement the decode() method. - - Arguments: - labels (string): mapping from integers to characters. - blank_index (int, optional): index for the blank '_' character. Defaults to 0. - space_index (int, optional): index for the space ' ' character. Defaults to 28. - """ - - def __init__(self, labels, blank_index=0): - # e.g. labels = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ#" - self.labels = labels - self.int_to_char = dict([(i, c) for (i, c) in enumerate(labels)]) - self.blank_index = blank_index - space_index = len(labels) # To prevent errors in decode, we add an out of bounds index for the space - if ' ' in labels: - space_index = labels.index(' ') - self.space_index = space_index - - def wer(self, s1, s2): - """ - Computes the Word Error Rate, defined as the edit distance between the - two provided sentences after tokenizing to words. - Arguments: - s1 (string): space-separated sentence - s2 (string): space-separated sentence - """ - - # build mapping of words to integers - b = set(s1.split() + s2.split()) - word2char = dict(zip(b, range(len(b)))) - - # map the words to a char array (Levenshtein packages only accepts - # strings) - w1 = [chr(word2char[w]) for w in s1.split()] - w2 = [chr(word2char[w]) for w in s2.split()] - - return Lev.distance(''.join(w1), ''.join(w2)) - - def cer(self, s1, s2): - """ - Computes the Character Error Rate, defined as the edit distance. - - Arguments: - s1 (string): space-separated sentence - s2 (string): space-separated sentence - """ - s1, s2, = s1.replace(' ', ''), s2.replace(' ', '') - return Lev.distance(s1, s2) - - def decode(self, probs, sizes=None): - """ - Given a matrix of character probabilities, returns the decoder's - best guess of the transcription - - Arguments: - probs: Tensor of character probabilities, where probs[c,t] - is the probability of character c at time t - sizes(optional): Size of each sequence in the mini-batch - Returns: - string: sequence of the model's best guess for the transcription - """ - raise NotImplementedError diff --git a/decoders/greedy_decoder.py b/decoders/greedy_decoder.py deleted file mode 100644 index c14884f..0000000 --- a/decoders/greedy_decoder.py +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env python -# ---------------------------------------------------------------------------- -# Copyright 2015-2016 Nervana Systems Inc. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ----------------------------------------------------------------------------
-# Modified to support pytorch Tensors
-
-import torch
-
-from decoders.decoder import Decoder
-
-
-class GreedyDecoder(Decoder):
-    def __init__(self, labels, blank_index=0):
-        super(GreedyDecoder, self).__init__(labels, blank_index)
-
-    def convert_to_strings(self, sequences, sizes=None, remove_repetitions=False, return_offsets=False):
-        """Given a list of numeric sequences, returns the corresponding strings"""
-        strings = []
-        offsets = [] if return_offsets else None
-        for x in range(len(sequences)):
-            seq_len = sizes[x] if sizes is not None else len(sequences[x])
-            string, string_offsets = self.process_string(sequences[x], seq_len, remove_repetitions)
-            strings.append([string])  # We only return one path
-            if return_offsets:
-                offsets.append([string_offsets])
-        if return_offsets:
-            return strings, offsets
-        else:
-            return strings
-
-    def process_string(self, sequence, size, remove_repetitions=False):
-        string = ''
-        offsets = []
-        for i in range(size):
-            char = self.int_to_char[sequence[i].item()]
-            if char != self.int_to_char[self.blank_index]:
-                # if this char is a repetition and remove_repetitions=true, then skip
-                if remove_repetitions and i != 0 and char == self.int_to_char[sequence[i - 1].item()]:
-                    pass
-                elif char == self.labels[self.space_index]:
-                    string += ' '
-                    offsets.append(i)
-                else:
-                    string = string + char
-                    offsets.append(i)
-        return string, torch.tensor(offsets, dtype=torch.int)
-
-    def decode(self, probs, sizes=None):
-        """
-        Returns the argmax decoding given the probability matrix. Removes
-        repeated elements in the sequence, as well as blanks.
-
-        Arguments:
-            probs: Tensor of character probabilities from the network. Expected shape of batch x seq_length x output_dim
-            sizes(optional): Size of each sequence in the mini-batch
-        Returns:
-            strings: sequences of the model's best guess for the transcription on inputs
-            offsets: time step per character predicted
-        """
-        _, max_probs = torch.max(probs, 2)
-        strings, offsets = self.convert_to_strings(max_probs.view(max_probs.size(0), max_probs.size(1)), sizes,
-                                                   remove_repetitions=True, return_offsets=True)
-        return strings, offsets
diff --git a/infer.py b/infer.py
deleted file mode 100644
index 8fe03d2..0000000
--- a/infer.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import argparse
-import os
-import wave
-from typing import Dict
-
-import yaml
-
-from modelwrapper import ModelWrapper
-
-parser = argparse.ArgumentParser(description='ASR inference')
-parser.add_argument('--config', metavar='DIR',
-                    help='Path to inference config file', default='config/infer.yaml')
-
-if __name__ == '__main__':
-    args = parser.parse_args()
-    with open(args.config, 'r') as file:
-        config = yaml.safe_load(file)
-    config_dict: Dict = config["infer"]
-    model = ModelWrapper(**config_dict)
-    # the config file defines this key as "audio_path"
-    if "audio_path" in config_dict.keys() and os.path.isfile(config_dict.get("audio_path")):
-        sound = wave.open(config_dict.get("audio_path"))
-        print(model.infer(sound))
-    else:
-        print("Wave file not found!")
diff --git a/loader.py b/loader.py
deleted file mode 100644
index 00b06d1..0000000
--- a/loader.py
+++ /dev/null
@@ -1,248 +0,0 @@
-# ----------------------------------------------------------------------------
-# Based on SeanNaren's deepspeech.pytorch:
-# https://github.com/SeanNaren/deepspeech.pytorch
-# ----------------------------------------------------------------------------
-
-import math
-import warnings
-from typing import Tuple
-
-import librosa
-import numpy as np
-import torch
-import torchaudio -from scipy import signal -from torch.utils.data import Dataset, DataLoader, Sampler -from torch.distributed import get_rank -from torch.distributed import get_world_size - -windows = {"bartlett": torch.bartlett_window, - "blackman": torch.blackman_window, - "hamming": torch.hamming_window, - "hann": torch.hann_window} - -windows_legacy = {'hamming': signal.hamming, - 'hann': signal.hann, - 'blackman': signal.blackman, - 'bartlett': signal.bartlett} - - -class DataProcessor(object): - def __init__(self, audio_conf, labels="abc", normalize=False, augment=False, legacy=True): - """ - Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by - a comma. Each new line is a different sample. Example below: - - /path/to/audio.wav,/path/to/audio.txt - ... - - :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds - :param labels: String containing all the possible characters to map to - :param normalize: Apply standard mean and deviation normalization to audio tensor - :param augment(default False): Apply random tempo and gain perturbations - """ - self.labels_map = dict([(labels[i], i) for i in range(len(labels))]) - self.window_stride = audio_conf["window_stride"] - self.window_size = audio_conf["window_size"] - self.sample_rate = audio_conf["sample_rate"] - self.window = windows_legacy.get(audio_conf["window"], windows_legacy["hamming"]) if legacy else windows.get(audio_conf["window"], windows["hamming"]) - self.normalize = normalize - self.augment = augment - self.legacy = legacy - self.transform = torchaudio.transforms.Spectrogram(n_fft=int(self.sample_rate * self.window_size), - hop=int(self.sample_rate * self.window_stride), - window=self.window, normalize=self.normalize) - - @staticmethod - def retrieve_file(audio_path, legacy=True): - sound, sample_rate = torchaudio.load(audio_path) - if legacy: - sound = sound.numpy().T - if len(sound.shape) > 1: - if sound.shape[1] == 1: - sound = sound.squeeze() - else: - sound = sound.mean(axis=1) - return sound, sample_rate - - @staticmethod - def augment_audio(sound, tempo_range: Tuple = (0.85, 1.15), gain_range: Tuple = (-6, 8)): - """ - Changes tempo and gain of the wave - """ - warnings.warn("Augmentation is not implemented") # TODO: Implement - return sound - - def parse_audio(self, audio_path): - sound, sample_rate = self.retrieve_file(audio_path, self.legacy) - if sample_rate != self.sample_rate: - raise ValueError(f"The stated sample rate {self.sample_rate} and the factual rate {sample_rate} differ!") - - if self.augment: - sound = self.augment_audio(sound) - - if self.legacy: - n_fft = int(self.sample_rate * self.window_size) - win_length = n_fft - hop_length = int(self.sample_rate * self.window_stride) - # STFT - D = librosa.stft(sound, n_fft=n_fft, hop_length=hop_length, - win_length=win_length, window=self.window) - spectrogram, phase = librosa.magphase(D) - # S = log(S+1) - - spectrogram = torch.FloatTensor(np.log1p(spectrogram)) - else: - # TODO: Why these are different from librosa.stft? 
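-            # Editor's note (assumption): the legacy branch log-scales magnitudes with
-            # np.log1p while this branch does not, and librosa's STFT defaults (e.g.
-            # centered, reflect-padded frames) need not match this torchaudio
-            # transform's settings, so some numeric difference is expected here.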
- sound = sound.cuda() - spectrogram = self.transform(sound)[-1, :, :].transpose(0, 1) - - # spectrogram = torch.stft(torch.from_numpy(sound.numpy().T.squeeze()), - # n_fft=int(self.sample_rate * self.window_size), - # hop_length=int(self.sample_rate * self.window_stride), - # win_length=int(self.sample_rate * self.window_size), - # window=torch.hamming_window(int(self.sample_rate * self.window_size)))[:, :, -1] - - if self.normalize: - mean = spectrogram.mean() - std = spectrogram.std() - spectrogram.add_(-mean) - spectrogram.div_(std) - - return spectrogram - - def parse_transcript(self, transcript_path): - with open(transcript_path, 'r', encoding='utf8') as transcript_file: - transcript = transcript_file.read().replace('\n', '') - # TODO: Is it fast enough? - transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)])) - return transcript - - -class AudioDataset(Dataset): - def __init__(self, audio_conf, manifest_filepath, labels, normalize=False, augment=False, legacy=True): - """ - Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by - a comma. Each new line is a different sample. Example below: - - /path/to/audio.wav,/path/to/audio.txt - ... - - :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds - :param manifest_filepath: Path to manifest csv as describe above - :param labels: String containing all the possible characters to map to - :param normalize: Apply standard mean and deviation normalization to audio tensor - :param augment(default False): Apply random tempo and gain perturbations - """ - super(AudioDataset, self).__init__() - with open(manifest_filepath) as f: - ids = f.readlines() - ids = [x.strip().split(',') for x in ids] - self.ids = ids - self.size = len(ids) - self.processor = DataProcessor(audio_conf, labels, normalize, augment, legacy) - - def __getitem__(self, index): - sample = self.ids[index] - audio_path, transcript_path = sample[0], sample[1] - - spectrogram = self.processor.parse_audio(audio_path) - transcript = self.processor.parse_transcript(transcript_path) - - return spectrogram, transcript - - def __len__(self): - return self.size - - -# TODO: Optimise -def _collate_fn(batch): - batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True) - longest_sample = batch[0][0] - freq_size, max_seqlength = longest_sample.size() - minibatch_size = len(batch) - inputs = torch.zeros(minibatch_size, 1, freq_size, max_seqlength) - input_percentages = torch.FloatTensor(minibatch_size) - target_sizes = np.zeros(minibatch_size, dtype=np.int32) - - # TODO: Numpy broadcasting magic - targets = [] - - for x in range(minibatch_size): - inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) - input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) - target_sizes[x] = len(batch[x][1]) - targets.extend(batch[x][1]) - - return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) - - -class AudioDataLoader(DataLoader): - def __init__(self, *args, **kwargs): - """ - Creates a data loader for AudioDatasets. - """ - super(AudioDataLoader, self).__init__(*args, **kwargs) - self.collate_fn = _collate_fn - - -class BucketingSampler(Sampler): - def __init__(self, data_source, batch_size=1): - """ - Samples batches assuming they are in order of size to batch similarly sized samples together. 
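-        Keeping each batch roughly homogeneous in length reduces the padding the
-        collate function must add, and with it the wasted computation per batch.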
- """ - super(BucketingSampler, self).__init__(data_source) - self.data_source = data_source - ids = list(range(0, len(data_source))) - # TODO: Optimise - self.bins = [ids[i:i + batch_size] for i in range(0, len(ids), batch_size)] - - def __iter__(self): - for ids in self.bins: - np.random.shuffle(ids) - yield ids - - def __len__(self): - return len(self.bins) - - def shuffle(self, epoch): - np.random.shuffle(self.bins) - - -# TODO: Optimise -class DistributedBucketingSampler(Sampler): - def __init__(self, data_source, batch_size=1, num_replicas=None, rank=None): - """ - Samples batches assuming they are in order of size to batch similarly sized samples together. - """ - super(DistributedBucketingSampler, self).__init__(data_source) - if num_replicas is None: - num_replicas = get_world_size() - if rank is None: - rank = get_rank() - self.data_source = data_source - self.ids = list(range(0, len(data_source))) - self.batch_size = batch_size - self.bins = [self.ids[i:i + batch_size] for i in range(0, len(self.ids), batch_size)] - self.num_replicas = num_replicas - self.rank = rank - self.num_samples = int(math.ceil(len(self.bins) * 1.0 / self.num_replicas)) - self.total_size = self.num_samples * self.num_replicas - - def __iter__(self): - offset = self.rank - # add extra samples to make it evenly divisible - bins = self.bins + self.bins[:(self.total_size - len(self.bins))] - assert len(bins) == self.total_size - samples = bins[offset::self.num_replicas] # Get every Nth bin, starting from rank - return iter(samples) - - def __len__(self): - return self.num_samples - - def shuffle(self, epoch): - # deterministically shuffle based on epoch - g = torch.Generator() - g.manual_seed(epoch) - bin_ids = list(torch.randperm(len(self.bins), generator=g)) - self.bins = [self.bins[i] for i in bin_ids] diff --git a/models/__init__.py b/models/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/models/deepspeech2.py b/models/deepspeech2.py deleted file mode 100644 index dcfa100..0000000 --- a/models/deepspeech2.py +++ /dev/null @@ -1,269 +0,0 @@ -# ---------------------------------------------------------------------------- -# Based on SeanNaren's deepspeech.pytorch: -# https://github.com/SeanNaren/deepspeech.pytorch -# ---------------------------------------------------------------------------- - -import math -from collections import OrderedDict - -import torch -import torch.nn as nn -import torch.nn.functional as F - - -supported_rnns = { - 'lstm': nn.LSTM, - 'rnn': nn.RNN, - 'gru': nn.GRU -} -supported_rnns_inv = dict((v, k) for k, v in supported_rnns.items()) - - -class SequenceWise(nn.Module): - def __init__(self, module): - """ - Collapses input of dim T*N*H to (T*N)*H, and applies to a module. - Allows handling of variable sequence lengths and minibatch sizes. - :param module: Module to apply input to. - """ - super(SequenceWise, self).__init__() - self.module = module - - def forward(self, x): - t, n = x.size(0), x.size(1) - x = x.view(t * n, -1) - x = self.module(x) - x = x.view(t, n, -1) - return x - - def __repr__(self): - tmpstr = self.__class__.__name__ + ' (\n' - tmpstr += self.module.__repr__() - tmpstr += ')' - return tmpstr - - -class MaskConv(nn.Module): - def __init__(self, seq_module): - """ - Adds padding to the output of the module based on the given lengths. This is to ensure that the - results of the model do not change when batch sizes change during inference. 
-        Input needs to be in the shape of (BxCxDxT)
-        :param seq_module: The sequential module containing the conv stack.
-        """
-        super(MaskConv, self).__init__()
-        self.seq_module = seq_module
-
-    def forward(self, x, lengths):
-        """
-        :param x: The input of size BxCxDxT
-        :param lengths: The actual length of each sequence in the batch
-        :return: Masked output from the module
-        """
-        for module in self.seq_module:
-            x = module(x)
-            mask = torch.ByteTensor(x.size()).fill_(0)
-            if x.is_cuda:
-                mask = mask.cuda()
-            for i, length in enumerate(lengths):
-                length = length.item()
-                if (mask[i].size(2) - length) > 0:
-                    mask[i].narrow(2, length, mask[i].size(2) - length).fill_(1)
-            x = x.masked_fill(mask, 0)
-        return x, lengths
-
-
-class InferenceBatchSoftmax(nn.Module):
-    def forward(self, input_):
-        if not self.training:
-            return F.softmax(input_, dim=-1)
-        else:
-            return input_
-
-
-class BatchRNN(nn.Module):
-    def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, batch_norm=True, bidirectional=True):
-        super(BatchRNN, self).__init__()
-        self.input_size = input_size
-        self.hidden_size = hidden_size
-        self.bidirectional = bidirectional  # stored so forward() can sum the two directions
-        self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None
-        self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size,
-                            bidirectional=bidirectional, bias=True)
-
-    def flatten_parameters(self):
-        self.rnn.flatten_parameters()
-
-    def forward(self, x, output_lengths):
-        if self.batch_norm is not None:
-            x = self.batch_norm(x)
-        x = nn.utils.rnn.pack_padded_sequence(x, output_lengths)
-        x, h = self.rnn(x)
-        x, _ = nn.utils.rnn.pad_packed_sequence(x)
-        if self.bidirectional:
-            x = x.view(x.size(0), x.size(1), 2, -1).sum(2).view(x.size(0), x.size(1), -1)  # (TxNxH*2) -> (TxNxH) by sum
-        return x
-
-
-class DeepSpeech2(nn.Module):
-    def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hid_size=768, nb_layers=5, audio_conf=None,
-                 bidirectional=True, mixed_precision=False):
-        super(DeepSpeech2, self).__init__()
-
-        # model metadata needed for serialization/deserialization
-        if audio_conf is None:
-            audio_conf = {}
-        self.version = '0.0.1'
-        self.hidden_size = rnn_hid_size
-        self.hidden_layers = nb_layers
-        self.rnn_type = rnn_type
-        self.audio_conf = audio_conf or {}
-        self.labels = labels
-        self.bidirectional = bidirectional
-        self.mixed_precision = mixed_precision  # referenced by ModelWrapper.train and utils.load_model
-
-        sample_rate = self.audio_conf.get("sample_rate", 16000)
-        window_size = self.audio_conf.get("window_size", 0.02)
-        num_classes = len(self.labels)
-
-        self.conv = MaskConv(nn.Sequential(
-            nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5)),
-            nn.BatchNorm2d(32),
-            nn.Hardtanh(0, 20, inplace=True),
-            nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5)),
-            nn.BatchNorm2d(32),
-            nn.Hardtanh(0, 20, inplace=True)
-        ))
-        # Based on above convolutions and spectrogram size using conv formula (W - F + 2P)/ S+1
-        rnn_in_size = int(math.floor((sample_rate * window_size) / 2) + 1)
-        rnn_in_size = int(math.floor(rnn_in_size + 2 * 20 - 41) / 2 + 1)
-        rnn_in_size = int(math.floor(rnn_in_size + 2 * 10 - 21) / 2 + 1)
-        rnn_in_size *= 32
-
-        rnns = [('0', BatchRNN(input_size=rnn_in_size, hidden_size=rnn_hid_size, rnn_type=rnn_type, batch_norm=False))]
-        rnns.extend([(f"{x + 1}", BatchRNN(input_size=rnn_hid_size, hidden_size=rnn_hid_size, rnn_type=rnn_type))
-                     for x in range(nb_layers - 1)])
-        self.rnns = nn.Sequential(OrderedDict(rnns))
-
-        fully_connected = nn.Sequential(
-            nn.BatchNorm1d(rnn_hid_size),
-            nn.Linear(rnn_hid_size, num_classes, bias=False)
-        )
-
-        self.fc = nn.Sequential(
-            SequenceWise(fully_connected),
-        )
-
-        # Softmax only at inference time: warp-ctc applies log-softmax internally,
-        # so during training the raw activations go straight to the loss.
-        self.inference_softmax = InferenceBatchSoftmax()
-
-    def forward(self, x, lengths):
-        # if x.is_cuda and self.mixed_precision:
-        #     x = x.half()
-        lengths = lengths.cpu().int()
-        output_lengths = self.get_seq_lens(lengths)
-        x, _ = self.conv(x, output_lengths)
-
-        sizes = x.size()
-        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # Collapse feature dimension
-        x = x.transpose(1, 2).transpose(0, 1).contiguous()  # TxNxH
-
-        for rnn in self.rnns:
-            x = rnn(x, output_lengths)
-
-        if not self.bidirectional:  # no need for lookahead layer in bidirectional
-            x = self.lookahead(x)  # NOTE: no lookahead layer is defined in this port yet
-
-        x = self.fc(x)
-        x = x.transpose(0, 1)
-        # identity in training mode, softmax in eval mode
-        x = self.inference_softmax(x)
-        return x, output_lengths
-
-    def get_seq_lens(self, input_length):
-        """
-        Given a 1D Tensor or Variable containing integer sequence lengths, return a 1D tensor or variable
-        containing the size sequences that will be output by the network.
-        :param input_length: 1D Tensor
-        :return: 1D Tensor scaled by model
-        """
-        seq_len = input_length
-        for m in self.conv.modules():
-            if type(m) == nn.modules.conv.Conv2d:
-                seq_len = ((seq_len + 2 * m.padding[1] - m.dilation[1] * (m.kernel_size[1] - 1) - 1) / m.stride[1] + 1)
-        return seq_len.int()
-
-    @classmethod
-    def load_model(cls, path):
-        package = torch.load(path, map_location=lambda storage, loc: storage)
-        model = cls(rnn_hid_size=package['hidden_size'], nb_layers=package['hidden_layers'],
-                    labels=package['labels'], audio_conf=package['audio_conf'],
-                    rnn_type=supported_rnns[package['rnn_type']], bidirectional=package.get('bidirectional', True))
-        model.load_state_dict(package['state_dict'])
-        for x in model.rnns:
-            x.flatten_parameters()
-
-        return model
-
-    @classmethod
-    def load_model_package(cls, package):
-        model = cls(rnn_hid_size=package['hidden_size'], nb_layers=package['hidden_layers'],
-                    labels=package['labels'], audio_conf=package['audio_conf'],
-                    rnn_type=supported_rnns[package['rnn_type']], bidirectional=package.get('bidirectional', True))
-        model.load_state_dict(package['state_dict'])
-
-        return model
-
-    @staticmethod
-    def serialize(model, optimizer=None, epoch=None, iteration=None, loss_results=None,
-                  cer_results=None, wer_results=None, avg_loss=None, meta=None):
-        package = {
-            'version': model.version,
-            'hidden_size': model.hidden_size,
-            'hidden_layers': model.hidden_layers,
-            'rnn_type': supported_rnns_inv.get(model.rnn_type, model.rnn_type.__name__.lower()),
-            'audio_conf': model.audio_conf,
-            'labels': model.labels,
-            'state_dict': model.state_dict(),
-            'bidirectional': model.bidirectional
-        }
-        if optimizer is not None:
-            package['optim_dict'] = optimizer.state_dict()
-        if avg_loss is not None:
-            package['avg_loss'] = avg_loss
-        if epoch is not None:
-            package['epoch'] = epoch + 1  # increment for readability
-        if iteration is not None:
-            package['iteration'] = iteration
-        if loss_results is not None:
-            package['loss_results'] = loss_results
-            package['cer_results'] = cer_results
-            package['wer_results'] = wer_results
-        if meta is not None:
-            package['meta'] = meta
-        return package
-
-    @staticmethod
-    def get_param_size(model):
-        params = 0
-        for p in model.parameters():
-            tmp = 1
-            for x in p.size():
-                tmp *= x
-            params += tmp
-        return params
-
-    def __repr__(self):
-        rep = f"DeepSpeech2 version: {self.version}\n" + \
-              "=======================================\n" + \
-              "Recurrent Neural Network Properties\n" + \
-              f"  RNN Type: \t{self.rnn_type.__name__.lower()}\n" + \
-              f"  RNN Layers:\t{self.hidden_layers}\n" + \
- f" RNN Size: \t{self.hidden_size}\n" + \ - f" Classes: \t{len(self.labels)}\n" + \ - "---------------------------------------\n" + \ - "Model Features\n" + \ - f" Labels: \t{self.labels}\n" + \ - f" Sample Rate: \t{self.audio_conf.get('sample_rate', 'n/a')}\n" + \ - f" Window Type: \t{self.audio_conf.get('window', 'n/a')}\n" + \ - f" Window Size: \t{self.audio_conf.get('window_size', 'n/a')}\n" + \ - f" Window Stride:\t{self.audio_conf.get('window_stride', 'n/a')}" - return rep diff --git a/modelwrapper.py b/modelwrapper.py deleted file mode 100644 index b313603..0000000 --- a/modelwrapper.py +++ /dev/null @@ -1,413 +0,0 @@ -import os.path -from random import random -from datetime import datetime - -import numpy as np -import torch - -from models.deepspeech2 import DeepSpeech2 - -import json -import os -import random -import time - -import torch.distributed as dist -import torch.utils.data.distributed - -try: - from apex.fp16_utils import FP16_Optimizer - from apex.parallel import DistributedDataParallel -except Exception as e: - print(f"Apex import failed: {e}") - -from tqdm import tqdm -from warpctc_pytorch import CTCLoss - -from loader import AudioDataLoader, AudioDataset, BucketingSampler, DistributedBucketingSampler -from decoders.greedy_decoder import GreedyDecoder -from utils import convert_model_to_half, reduce_tensor, check_loss - -models = {"deepspeech2": DeepSpeech2} - -sttime = datetime.now() -print(f"Time of start: {sttime}") - - -def to_np(x): - return x.cpu().numpy() - - -class AverageMeter(object): - """Computes and stores the average and current value""" - - def __init__(self): - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - -class ModelWrapper(object): - DEF_PATH = "examples/checkpoints/" - - def __init__(self, **kwargs): - self.model = kwargs.get("model", models["deepspeech2"]) - - if kwargs.get("continue"): - path = kwargs.get("from", ModelWrapper.get_default_path()) - self.model.package = torch.load(path, map_location=lambda storage, loc: storage) - self.model.load_model(path) - - self.save_path = kwargs.get("save", ModelWrapper.DEF_PATH + str(datetime.now().timestamp())) - - self.cuda = kwargs.get("cuda") - self.apex = kwargs.get("apex") if self.cuda else False - self.half = self.apex if self.apex else kwargs.get("half") - - def train(self, seed, cuda, mixed_precision, world_size, gpu_rank, rank, save_folder, dist_backend, dist_url, - epochs, continue_from, finetune, labels_path, sample_rate, window_size, window_stride, window, - hidden_size, hidden_layers, labels, supported_rnns, bidirectional, no_shuffle, no_sorta_grad, rnn_type, - train_manifest, augment, batch_size, num_workers, momentum, lr, static_loss_scale, dynamic_loss_scale, - val_manifest, max_norm, silent, checkpoint_per_batch, checkpoint, learning_anneal, model_path): - # Set seeds for determinism - torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - np.random.seed(seed) - random.seed(seed) - - device = torch.device("cuda" if cuda else "cpu") - if mixed_precision and not cuda: - raise ValueError('If using mixed precision training, CUDA must be enabled!') - distributed = world_size > 1 - main_proc = True - device = torch.device("cuda" if cuda else "cpu") - if distributed: - if gpu_rank: - torch.cuda.set_device(int(gpu_rank)) - dist.init_process_group(backend=dist_backend, init_method=dist_url, - world_size=world_size, 
rank=rank)
-            main_proc = rank == 0  # Only the first proc should save models
-        save_folder = save_folder
-        os.makedirs(save_folder, exist_ok=True)  # Ensure save folder exists
-
-        loss_results, cer_results, wer_results = torch.Tensor(epochs), torch.Tensor(epochs), torch.Tensor(epochs)
-        best_wer = None
-
-        avg_loss, start_epoch, start_iter, optim_state = 0, 0, 0, None
-        if continue_from:  # Starting from previous model
-            print("Loading checkpoint model %s" % continue_from)
-
-            labels = self.model.labels
-            audio_conf = self.model.audio_conf
-            if not finetune:  # Don't want to restart training
-                optim_state = self.model.package['optim_dict']
-                start_epoch = int(self.model.package.get('epoch', 1)) - 1  # Index start at 0 for training
-                start_iter = self.model.package.get('iteration', None)
-                if start_iter is None:
-                    start_epoch += 1  # We saved model after epoch finished, start at the next epoch.
-                    start_iter = 0
-                else:
-                    start_iter += 1
-                avg_loss = int(self.model.package.get('avg_loss', 0))
-                loss_results, cer_results, wer_results = self.model.package['loss_results'], \
-                                                         self.model.package['cer_results'], \
-                                                         self.model.package['wer_results']
-        else:
-            with open(labels_path) as label_file:
-                labels = str(''.join(json.load(label_file)))
-
-            audio_conf = dict(sample_rate=sample_rate,
-                              window_size=window_size,
-                              window_stride=window_stride,
-                              window=window)
-
-        rnn_type = rnn_type.lower()
-        assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
-        model = self.model(rnn_hid_size=hidden_size,
-                           nb_layers=hidden_layers,
-                           labels=labels,
-                           rnn_type=supported_rnns[rnn_type],
-                           audio_conf=audio_conf,
-                           bidirectional=bidirectional,
-                           mixed_precision=mixed_precision)
-
-        decoder = GreedyDecoder(labels)
-
-        train_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=train_manifest, labels=labels,
-                                     normalize=False, augment=augment)
-        test_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=val_manifest, labels=labels,
-                                    normalize=False, augment=False)
-        if not distributed:
-            train_sampler = BucketingSampler(train_dataset, batch_size=batch_size)
-        else:
-            train_sampler = DistributedBucketingSampler(train_dataset, batch_size=batch_size,
-                                                        num_replicas=world_size, rank=rank)
-
-        train_loader = AudioDataLoader(train_dataset, num_workers=num_workers, batch_sampler=train_sampler)
-        test_loader = AudioDataLoader(test_dataset, batch_size=batch_size, num_workers=num_workers)
-
-        if (not no_shuffle and start_epoch != 0) or no_sorta_grad:
-            print("Shuffling batches for the following epochs")
-            train_sampler.shuffle(start_epoch)
-
-        model = model.to(device)
-        if mixed_precision:
-            model = convert_model_to_half(model)
-        parameters = model.parameters()
-        optimizer = torch.optim.SGD(parameters, lr=lr,
-                                    momentum=momentum, nesterov=True, weight_decay=1e-5)
-        if distributed:
-            model = DistributedDataParallel(model)
-        if mixed_precision:
-            optimizer = FP16_Optimizer(optimizer,
-                                       static_loss_scale=static_loss_scale,
-                                       dynamic_loss_scale=dynamic_loss_scale)
-        if optim_state is not None:
-            optimizer.load_state_dict(optim_state)
-        print(model)
-        print("Number of parameters: %d" % self.model.get_param_size(model))
-
-        criterion = CTCLoss()
-        batch_time = AverageMeter()
-        data_time = AverageMeter()
-        losses = AverageMeter()
-
-        for epoch in range(start_epoch, epochs):
-            model.train()
-            end = time.time()
-            start_epoch_time = time.time()
-            for i, (data) in enumerate(train_loader, start=start_iter):
-                if i == len(train_sampler):
-                    break
-                inputs, targets, input_percentages, target_sizes = 
data - input_sizes = input_percentages.mul_(int(inputs.size(3))).int() - # measure data loading time - data_time.update(time.time() - end) - inputs = inputs.to(device) - - out, output_sizes = model(inputs, input_sizes) - out = out.transpose(0, 1) # TxNxH - - float_out = out.float() # ensure float32 for loss - loss = criterion(float_out, targets, output_sizes, target_sizes).to(device) - loss = loss / inputs.size(0) # average the loss by minibatch - - if distributed: - loss = loss.to(device) - loss_value = reduce_tensor(loss, world_size).item() - else: - loss_value = loss.item() - - # Check to ensure valid loss was calculated - valid_loss, error = check_loss(loss, loss_value) - if valid_loss: - optimizer.zero_grad() - # compute gradient - if mixed_precision: - optimizer.backward(loss) - optimizer.clip_master_grads(max_norm) - else: - loss.backward() - torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) - optimizer.step() - else: - print(error) - print('Skipping grad update') - loss_value = 0 - - avg_loss += loss_value - losses.update(loss_value, inputs.size(0)) - - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - if not silent: - print('Epoch: [{0}][{1}/{2}]\t' - 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' - 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' - 'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format( - (epoch + 1), (i + 1), len(train_sampler), batch_time=batch_time, data_time=data_time, - loss=losses)) - if checkpoint_per_batch > 0 and i > 0 and (i + 1) % checkpoint_per_batch == 0 and main_proc: - file_path = '%s/deepspeech_checkpoint_epoch_%d_iter_%d.pth' % (save_folder, epoch + 1, i + 1) - print("Saving checkpoint model to %s" % file_path) - torch.save(self.model.serialize(model, optimizer=optimizer, epoch=epoch, iteration=i, - loss_results=loss_results, - wer_results=wer_results, cer_results=cer_results, - avg_loss=avg_loss), - file_path) - del loss, out, float_out - - avg_loss /= len(train_sampler) - - epoch_time = time.time() - start_epoch_time - print(f"Elapsed time from start: {datetime.now() - sttime}") - print('Training Summary Epoch: [{0}]\t' - 'Time taken (s): {epoch_time:.0f}\t' - 'Average Loss {loss:.3f}\t'.format(epoch + 1, epoch_time=epoch_time, loss=avg_loss)) - - start_iter = 0 # Reset start iteration for next epoch - total_cer, total_wer = 0, 0 - model.eval() - with torch.no_grad(): - for i, (data) in tqdm(enumerate(test_loader), total=len(test_loader)): - inputs, targets, input_percentages, target_sizes = data - input_sizes = input_percentages.mul_(int(inputs.size(3))).int() - inputs = inputs.to(device) - - # unflatten targets - split_targets = [] - offset = 0 - for size in target_sizes: - split_targets.append(targets[offset:offset + size]) - offset += size - - out, output_sizes = model(inputs, input_sizes) - - decoded_output, _ = decoder.decode(out, output_sizes) - target_strings = decoder.convert_to_strings(split_targets) - wer, cer = 0, 0 - for x in range(len(target_strings)): - transcript, reference = decoded_output[x][0], target_strings[x][0] - wer += decoder.wer(transcript, reference) / float(len(reference.split())) - cer += decoder.cer(transcript, reference) / float(len(reference)) - total_cer += cer - total_wer += wer - del out - wer = total_wer / len(test_loader.dataset) - cer = total_cer / len(test_loader.dataset) - wer *= 100 - cer *= 100 - loss_results[epoch] = avg_loss - wer_results[epoch] = wer - cer_results[epoch] = cer - print('Validation Summary Epoch: [{0}]\t' - 'Average WER {wer:.3f}\t' - 
'Average CER {cer:.3f}\t'.format( - epoch + 1, wer=wer, cer=cer)) - - values = { - 'loss_results': loss_results, - 'cer_results': cer_results, - 'wer_results': wer_results - } - - if main_proc and checkpoint: - file_path = '%s/deepspeech_%d.pth.tar' % (save_folder, epoch + 1) - torch.save(self.model.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results, - wer_results=wer_results, cer_results=cer_results), - file_path) - # anneal lr - param_groups = optimizer.optimizer.param_groups if mixed_precision else optimizer.param_groups - for g in param_groups: - g['lr'] = g['lr'] / learning_anneal - print('Learning rate annealed to: {lr:.6f}'.format(lr=g['lr'])) - - if main_proc and (best_wer is None or best_wer > wer): - print("Found better validated model, saving to %s" % model_path) - torch.save(self.model.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results, - wer_results=wer_results, cer_results=cer_results), model_path) - best_wer = wer - - avg_loss = 0 - if not no_shuffle: - print("Shuffling batches...") - train_sampler.shuffle(epoch) - - def validate(self): - pass - - def test(self): - torch.set_grad_enabled(False) - device = torch.device("cuda" if cuda else "cpu") - model = load_model(device, model_path, cuda) - - if decoder == "beam": - from decoder import BeamCTCDecoder - - decoder = BeamCTCDecoder(model.labels, lm_path=lm_path, alpha=alpha, beta=beta, - cutoff_top_n=cutoff_top_n, cutoff_prob=cutoff_prob, - beam_width=beam_width, num_processes=lm_workers) - elif decoder == "greedy": - decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_')) - else: - decoder = None - target_decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_')) - test_dataset = AudioDataset(audio_conf=model.audio_conf, manifest_filepath=test_manifest, - labels=model.labels, normalize=True) - test_loader = AudioDataLoader(test_dataset, batch_size=batch_size, - num_workers=num_workers) - total_cer, total_wer, num_tokens, num_chars = 0, 0, 0, 0 - output_data = [] - for i, (data) in tqdm(enumerate(test_loader), total=len(test_loader)): - inputs, targets, input_percentages, target_sizes = data - input_sizes = input_percentages.mul_(int(inputs.size(3))).int() - inputs = inputs.to(device) - # unflatten targets - split_targets = [] - offset = 0 - for size in target_sizes: - split_targets.append(targets[offset:offset + size]) - offset += size - - out, output_sizes = model(inputs, input_sizes) - - if save_output: - # add output to data array, and continue - output_data.append((out.cpu().numpy(), output_sizes.numpy())) - - decoded_output, _ = decoder.decode(out, output_sizes) - target_strings = target_decoder.convert_to_strings(split_targets) - for x in range(len(target_strings)): - transcript, reference = decoded_output[x][0], target_strings[x][0] - wer_inst = decoder.wer(transcript, reference) - cer_inst = decoder.cer(transcript, reference) - total_wer += wer_inst - total_cer += cer_inst - num_tokens += len(reference.split()) - num_chars += len(reference) - if verbose: - print("Ref:", reference.lower()) - print("Hyp:", transcript.lower()) - print("WER:", float(wer_inst) / len(reference.split()), "CER:", float(cer_inst) / len(reference), - "\n") - - wer = float(total_wer) / num_tokens - cer = float(total_cer) / num_chars - - print('Test Summary \t' - 'Average WER {wer:.3f}\t' - 'Average CER {cer:.3f}\t'.format(wer=wer * 100, cer=cer * 100)) - if save_output: - np.save(output_path, output_data) - - def infer(self, sound): - pass - - @staticmethod - 
def get_default_path(def_path: str) -> str:
-        """
-        Returns the path to the latest checkpoint in the default location
-        :param def_path: default path where checkpoints are stored
-        :return: the path to the latest checkpoint
-        """
-        latest_subdir = max([os.path.join(def_path, d) for d in os.listdir(def_path)], key=os.path.getmtime)
-        default = latest_subdir + "/final.pth"
-        return default
-
-    def print_training_info(self, epoch, loss, cer, wer):
-        print(f"\nTraining Information\n " + \
-              f"- Epoch:\t{epoch}\n " + \
-              f"- Current Loss:\t{loss}\n " + \
-              f"- Current CER: \t{cer}\n" + \
-              f"- Current WER: \t{wer}")
diff --git a/test.py b/test.py
deleted file mode 100644
index f4d8a39..0000000
--- a/test.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import argparse
-from typing import Dict
-
-import yaml
-
-from modelwrapper import ModelWrapper
-
-parser = argparse.ArgumentParser(description='ASR testing')
-parser.add_argument('--config', metavar='DIR',
-                    help='Path to test config file', default='config/test.yaml')
-
-if __name__ == '__main__':
-    args = parser.parse_args()
-    with open(args.config, 'r') as file:
-        config = yaml.safe_load(file)
-    config_dict: Dict = config["test"]
-    model = ModelWrapper(**config_dict)
-    model.test()
diff --git a/train.py b/train.py
deleted file mode 100644
index a763a47..0000000
--- a/train.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import argparse
-from typing import Dict
-
-import yaml
-
-from modelwrapper import ModelWrapper
-
-parser = argparse.ArgumentParser(description='ASR training')
-parser.add_argument('--config', metavar='DIR',
-                    help='Path to train config file', default='config/train.yaml')
-
-if __name__ == '__main__':
-    args = parser.parse_args()
-    with open(args.config, 'r') as file:
-        config = yaml.safe_load(file)
-    config_dict: Dict = config["train"]
-    model = ModelWrapper(**config_dict)
-    model.train()
diff --git a/utils.py b/utils.py
deleted file mode 100644
index 5e219d1..0000000
--- a/utils.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import torch
-from apex.fp16_utils import BN_convert_float
-import torch.distributed as dist
-
-from models.deepspeech2 import DeepSpeech2
-
-
-def convert_model_to_half(model):
-    """
-    Converts model to half but keeps the batch norm layers in 32 bit for precision purposes
-    """
-    old_model = model
-    new_model = BN_convert_float(model.half())
-    del old_model  # Delete previous non-half model
-    return new_model
-
-
-def reduce_tensor(tensor, world_size, reduce_op_max=False):
-    rt = tensor.clone()
-    dist.all_reduce(rt, op=dist.reduce_op.MAX if reduce_op_max is True else dist.reduce_op.SUM)  # Default to sum
-    if not reduce_op_max:
-        rt /= world_size
-    return rt
-
-
-def check_loss(loss, loss_value):
-    """
-    Check that warp-ctc loss is valid and will not break training
-    :return: Return if loss is valid, and the error in case it is not
-    """
-    loss_valid = True
-    error = ''
-    if loss_value == float("inf") or loss_value == float("-inf"):
-        loss_valid = False
-        error = "WARNING: received an inf loss"
-    elif torch.isnan(loss).sum() > 0:
-        loss_valid = False
-        error = 'WARNING: received a nan loss, setting loss value to 0'
-    elif loss_value < 0:
-        loss_valid = False
-        error = "WARNING: received a negative loss"
-    return loss_valid, error
-
-
-def load_model(device, model_path, is_cuda):
-    model = DeepSpeech2.load_model(model_path)
-    model.eval()
-    model = model.to(device)
-    # older checkpoints may not carry the mixed_precision flag
-    if is_cuda and getattr(model, "mixed_precision", False):
-        model = convert_model_to_half(model)
-    return model

From 70d4fc3396edbfd901a64fc65487db555a377a19 Mon Sep 17 00:00:00 2001
From: yuriyarabskyy
Date: Sat, 25 May 2019 12:31:22 +0200
Subject: [PATCH 03/58] restructure code

---
 requirements.txt                   |  24 ++
 setup.py                           |   9 +
 sonosco/STT_srv.py                 |  43 +++
 sonosco/config/infer.yaml          |  21 ++
 sonosco/config/test.yaml           |   9 +
 sonosco/config/train.yaml          |  49 ++++
 sonosco/decoders/__init__.py       |   0
 sonosco/decoders/beam_decoder.py   |  75 ++++++
 sonosco/decoders/decoder.py        |  85 ++++++
 sonosco/decoders/greedy_decoder.py |  74 ++++++
 sonosco/infer.py                   |  24 ++
 sonosco/loader.py                  | 248 +++++++++++++++++
 sonosco/models/__init__.py         |   0
 sonosco/models/deepspeech2.py      | 269 +++++++++++++++++++
 sonosco/modelwrapper.py            | 413 +++++++++++++++++++++++++++++
 sonosco/test.py                    |  18 ++
 sonosco/train.py                   |  18 ++
 sonosco/utils.py                   |  51 ++++
 18 files changed, 1430 insertions(+)
 create mode 100644 requirements.txt
 create mode 100644 setup.py
 create mode 100644 sonosco/STT_srv.py
 create mode 100644 sonosco/config/infer.yaml
 create mode 100644 sonosco/config/test.yaml
 create mode 100644 sonosco/config/train.yaml
 create mode 100644 sonosco/decoders/__init__.py
 create mode 100644 sonosco/decoders/beam_decoder.py
 create mode 100644 sonosco/decoders/decoder.py
 create mode 100644 sonosco/decoders/greedy_decoder.py
 create mode 100644 sonosco/infer.py
 create mode 100644 sonosco/loader.py
 create mode 100644 sonosco/models/__init__.py
 create mode 100644 sonosco/models/deepspeech2.py
 create mode 100644 sonosco/modelwrapper.py
 create mode 100644 sonosco/test.py
 create mode 100644 sonosco/train.py
 create mode 100644 sonosco/utils.py

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c7d87ee
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,24 @@
+apex==0.1
+audioread==2.1.7
+cycler==0.10.0
+decorator==4.4.0
+joblib==0.13.2
+kiwisolver==1.1.0
+librosa==0.6.3
+llvmlite==0.28.0
+matplotlib==3.1.0
+numba==0.43.1
+numpy==1.16.3
+Pillow==6.0.0
+pyparsing==2.4.0
+python-dateutil==2.8.0
+resampy==0.2.1
+scikit-learn==0.21.2
+scipy==1.3.0
+six==1.12.0
+torch==1.1.0
+torchaudio==0.2
+torchvision==0.3.0
+tqdm==4.32.1
+pyyaml==5.1
+-e git+git@github.com:SeanNaren/warp-ctc.git@ab27454d0cf3f936a6e19165682fe2b759f3f8e5#egg=warpctc_pytorch&subdirectory=pytorch_binding
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..bdb1387
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,9 @@
+from setuptools import setup, find_packages
+
+setup(
+    name="sonosco",
+    description="Framework for training automatic speech recognition systems.",
+    author="The Roboy Gang",
+    packages=find_packages(),  # picks up sonosco and its subpackages (decoders, models, ...)
+    include_package_data=True,
+)
diff --git a/sonosco/STT_srv.py b/sonosco/STT_srv.py
new file mode 100644
index 0000000..7013756
--- /dev/null
+++ b/sonosco/STT_srv.py
@@ -0,0 +1,43 @@
+import os
+
+from roboy_cognition_msgs.msg import RecognizedSpeech
+from roboy_cognition_msgs.srv import RecognizeSpeech
+
+from asr_interface import IAsr
+import rclpy
+from rclpy.node import Node
+
+
+class SonoscoROS2(Node):
+    def __init__(self):
+        super().__init__('stt')
+        self.publisher = self.create_publisher(RecognizedSpeech, '/roboy/cognition/speech/recognition')
+        self.srv = self.create_service(RecognizeSpeech, '/roboy/cognition/speech/recognition/recognize', self.asr_callback)
+        print("Ready to serve /roboy/cognition/speech/recognition/recognize")
+        print(f"Roboy Sonosco running with PID: {os.getpid()}")
+        self.i = IAsr()
+        print("Status: Speech recognition is ready now!")
+        print("Roboy Sonosco is ready!")
+
+    def asr_callback(self, request, response):
+        response.success = True
+        self.get_logger().info('Incoming Audio')
+        msg = RecognizedSpeech()
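+        # NOTE: the transcript produced by inference_audio() below is never copied
+        # into `msg`, so an empty RecognizedSpeech message is published; presumably
+        # the recognized text should first be assigned to the message fields.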
self.i.inference_audio(request)
+        self.publisher.publish(msg)
+        return response
+
+
+def main(args=None):
+    rclpy.init(args=args)
+
+    stt = SonoscoROS2()
+
+    while rclpy.ok():
+        rclpy.spin_once(stt)
+
+    rclpy.shutdown()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/sonosco/config/infer.yaml b/sonosco/config/infer.yaml
new file mode 100644
index 0000000..be61617
--- /dev/null
+++ b/sonosco/config/infer.yaml
@@ -0,0 +1,21 @@
+infer:
+  model_name: ""
+  audio_path: ""        # Audio file to predict on
+
+  sample_rate: 16000    # Sample rate
+  window_size: 0.02     # Window size for spectrogram in seconds
+  window_stride: 0.01   # Window stride for spectrogram in seconds
+  window: 'hamming'     # Window type for spectrogram generation
+
+  beam_decoder: False   # Use the beam decoder; otherwise greedy decoding is used
+  alpha: 0.8
+  beam_width: 10
+  beta: 1
+  cutoff_prob: 1.0
+  cutoff_top_n: 40
+  lm_path: null         # Path to a KenLM binary (null disables the LM)
+  lm_workers: 1
+  offsets: False        # Returns time offset information
+  top_paths: 1
+
+  cuda: True            # Use cuda to run model
\ No newline at end of file
diff --git a/sonosco/config/test.yaml b/sonosco/config/test.yaml
new file mode 100644
index 0000000..2589e15
--- /dev/null
+++ b/sonosco/config/test.yaml
@@ -0,0 +1,9 @@
+test:
+  test_manifest: ""     # Path to test manifest csv
+
+  batch_size: 32        # Batch size for testing
+  num_workers: 4        # Number of workers used in loading
+  verbose: True         # Print out decoded output and error of each sample
+  save_output: True     # Saves output of model from test
+  output_path: ""       # Where to save raw acoustic output
+
diff --git a/sonosco/config/train.yaml b/sonosco/config/train.yaml
new file mode 100644
index 0000000..25be72c
--- /dev/null
+++ b/sonosco/config/train.yaml
@@ -0,0 +1,49 @@
+train:
+  train_manifest: 'examples/manifests/train_manifest.csv'
+  val_manifest: 'examples/manifests/val_manifest.csv'
+  labels_path: 'examples/labels.json'  # Contains all characters for transcription
+  log_dir: 'logs'                      # Location for log files
+  def_dir: 'examples/checkpoints/'     # Default location to save/load models
+
+  load_from: 'asr_final.pth'  # File name containing a checkpoint to continue/finetune
+
+  sample_rate: 16000    # Sample rate
+  window_size: 0.02     # Window size for spectrogram in seconds
+  window_stride: 0.01   # Window stride for spectrogram in seconds
+  window: 'hamming'     # Window type for spectrogram generation
+
+  batch_size: 32        # Batch size for training
+  hidden_size: 800      # Hidden size of RNNs
+  hidden_layers: 5      # Number of RNN layers
+  rnn_type: 'gru'       # Type of the RNN unit: gru|lstm are supported
+
+  max_epochs: 70        # Number of training epochs
+  learning_rate: 3e-4   # Initial learning rate
+  momentum: 0.9         # Momentum
+  max_norm: 800         # Norm cutoff to prevent explosion of gradients
+  learning_anneal: 1.1  # Annealing applied to learning rate every epoch
+  sortaGrad: True       # Turn on ordering of dataset on sequence length for the first epoch
+
+  checkpoint: True          # Enables checkpoint saving of model
+  checkpoint_per_epoch: 1   # Save checkpoint per x epochs
+  silent: False             # Suppress per-iteration progress output
+  verbose: False            # Turn on verbose progress tracking
+  continue: False           # Continue training with a pre-trained model
+  finetune: False           # Finetune a pre-trained model
+
+  num_data_workers: 8   # Number of workers used in data-loading
+  augment: False        # Use random tempo and gain perturbations
+  shuffle: True         # Turn on shuffling and sample from dataset based on sequence length (smallest to largest)
+
+  seed: 123456          # Seed to generators
+  cuda: True            # Use cuda to train model
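Before the precision and distributed flags below, a note on how these config sections are consumed: the entry scripts added later in this patch parse the YAML into a dict and splat it into ModelWrapper, which is why each value's type matters (a misspelled boolean would silently arrive as a truthy string). A minimal sketch of that flow, using yaml.safe_load to sidestep the loader warning of the pinned PyYAML 5.1:

    import yaml

    with open("config/train.yaml") as f:
        config = yaml.safe_load(f)  # safe_load avoids PyYAML 5.1's yaml.load() warning

    train_conf = config["train"]
    assert isinstance(train_conf["cuda"], bool)  # True/False must parse as booleans, not strings
    # ModelWrapper(**train_conf): keys like 'continue' only work because they
    # arrive via **kwargs; 'continue' is a reserved word in Python otherwise.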
+  half_precision: True         # Uses half precision to train a model
+  apex: True                   # Uses mixed precision to train a model
+  static_loss_scaling: False   # Static loss scale for mixed precision
+  dynamic_loss_scaling: True   # Use dynamic loss scaling for mixed precision
+
+  dist_url: 'tcp://127.0.0.1:1550'  # URL used to set up distributed training
+  dist_backend: 'nccl'              # Distributed backend
+  world_size: 1                     # Number of distributed processes
+  rank: 0                           # The rank of the current process
+  gpu_rank: 0                       # If using distributed parallel for multi_gpu, sets the GPU for the process
\ No newline at end of file
diff --git a/sonosco/decoders/__init__.py b/sonosco/decoders/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/sonosco/decoders/beam_decoder.py b/sonosco/decoders/beam_decoder.py
new file mode 100644
index 0000000..c44d164
--- /dev/null
+++ b/sonosco/decoders/beam_decoder.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+# ----------------------------------------------------------------------------
+# Copyright 2015-2016 Nervana Systems Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ----------------------------------------------------------------------------
+# Modified to support pytorch Tensors
+import torch
+
+from decoders.decoder import Decoder
+
+
+class BeamCTCDecoder(Decoder):
+    def __init__(self, labels, lm_path=None, alpha=0, beta=0, cutoff_top_n=40, cutoff_prob=1.0, beam_width=100,
+                 num_processes=4, blank_index=0):
+        super(BeamCTCDecoder, self).__init__(labels)
+        try:
+            from ctcdecode import CTCBeamDecoder
+        except ImportError:
+            raise ImportError("BeamCTCDecoder requires the ctcdecode package.")
+        self._decoder = CTCBeamDecoder(labels, lm_path, alpha, beta, cutoff_top_n, cutoff_prob, beam_width,
+                                       num_processes, blank_index)
+
+    def convert_to_strings(self, out, seq_len):
+        results = []
+        for b, batch in enumerate(out):
+            utterances = []
+            for p, utt in enumerate(batch):
+                size = seq_len[b][p]
+                if size > 0:
+                    transcript = ''.join(map(lambda x: self.int_to_char[x.item()], utt[0:size]))
+                else:
+                    transcript = ''
+                utterances.append(transcript)
+            results.append(utterances)
+        return results
+
+    def convert_tensor(self, offsets, sizes):
+        results = []
+        for b, batch in enumerate(offsets):
+            utterances = []
+            for p, utt in enumerate(batch):
+                size = sizes[b][p]
+                if sizes[b][p] > 0:
+                    utterances.append(utt[0:size])
+                else:
+                    utterances.append(torch.tensor([], dtype=torch.int))
+            results.append(utterances)
+        return results
+
+    def decode(self, probs, sizes=None):
+        """
+        Decodes probability output using the ctcdecode package.
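For orientation, a minimal usage sketch of this decoder, assuming the optional ctcdecode dependency is installed, the script runs from inside the sonosco package so the decoders import resolves, and the label string starts with the CTC blank:

    import torch

    from decoders.beam_decoder import BeamCTCDecoder

    labels = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ "
    decoder = BeamCTCDecoder(labels, beam_width=10, blank_index=0)

    # Fake model output: batch x time x classes probabilities plus per-utterance lengths.
    probs = torch.softmax(torch.randn(2, 50, len(labels)), dim=-1)
    sizes = torch.tensor([50, 42], dtype=torch.int)
    transcripts, offsets = decoder.decode(probs, sizes)
    print(transcripts[0][0])  # best beam for the first utterance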
+ Arguments: + probs: Tensor of character probabilities, where probs[c,t] + is the probability of character c at time t + sizes: Size of each sequence in the mini-batch + Returns: + string: sequences of the model's best guess for the transcription + """ + probs = probs.cpu() + out, scores, offsets, seq_lens = self._decoder.decode(probs, sizes) + + strings = self.convert_to_strings(out, seq_lens) + offsets = self.convert_tensor(offsets, seq_lens) + return strings, offsets \ No newline at end of file diff --git a/sonosco/decoders/decoder.py b/sonosco/decoders/decoder.py new file mode 100644 index 0000000..99a8193 --- /dev/null +++ b/sonosco/decoders/decoder.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python +# ---------------------------------------------------------------------------- +# Copyright 2015-2016 Nervana Systems Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- +# Modified to support pytorch Tensors + +import Levenshtein as Lev + + +class Decoder(object): + """ + Basic decoder class from which all other decoders inherit. Implements several + helper functions. Subclasses should implement the decode() method. + + Arguments: + labels (string): mapping from integers to characters. + blank_index (int, optional): index for the blank '_' character. Defaults to 0. + space_index (int, optional): index for the space ' ' character. Defaults to 28. + """ + + def __init__(self, labels, blank_index=0): + # e.g. labels = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ#" + self.labels = labels + self.int_to_char = dict([(i, c) for (i, c) in enumerate(labels)]) + self.blank_index = blank_index + space_index = len(labels) # To prevent errors in decode, we add an out of bounds index for the space + if ' ' in labels: + space_index = labels.index(' ') + self.space_index = space_index + + def wer(self, s1, s2): + """ + Computes the Word Error Rate, defined as the edit distance between the + two provided sentences after tokenizing to words. + Arguments: + s1 (string): space-separated sentence + s2 (string): space-separated sentence + """ + + # build mapping of words to integers + b = set(s1.split() + s2.split()) + word2char = dict(zip(b, range(len(b)))) + + # map the words to a char array (Levenshtein packages only accepts + # strings) + w1 = [chr(word2char[w]) for w in s1.split()] + w2 = [chr(word2char[w]) for w in s2.split()] + + return Lev.distance(''.join(w1), ''.join(w2)) + + def cer(self, s1, s2): + """ + Computes the Character Error Rate, defined as the edit distance. 
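The wer method above relies on a compact trick: every distinct word is mapped to a single character, so Levenshtein.distance, which only operates on strings, effectively computes a word-level edit distance. A small worked example:

    import Levenshtein as Lev

    s1, s2 = "the cat sat", "the cat sat down"
    vocab = set(s1.split() + s2.split())
    word2char = dict(zip(vocab, range(len(vocab))))

    # Encode each sentence as one character per word, then compare.
    w1 = ''.join(chr(word2char[w]) for w in s1.split())
    w2 = ''.join(chr(word2char[w]) for w in s2.split())
    print(Lev.distance(w1, w2))  # 1 (a single inserted word)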
+ + Arguments: + s1 (string): space-separated sentence + s2 (string): space-separated sentence + """ + s1, s2, = s1.replace(' ', ''), s2.replace(' ', '') + return Lev.distance(s1, s2) + + def decode(self, probs, sizes=None): + """ + Given a matrix of character probabilities, returns the decoder's + best guess of the transcription + + Arguments: + probs: Tensor of character probabilities, where probs[c,t] + is the probability of character c at time t + sizes(optional): Size of each sequence in the mini-batch + Returns: + string: sequence of the model's best guess for the transcription + """ + raise NotImplementedError diff --git a/sonosco/decoders/greedy_decoder.py b/sonosco/decoders/greedy_decoder.py new file mode 100644 index 0000000..c14884f --- /dev/null +++ b/sonosco/decoders/greedy_decoder.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python +# ---------------------------------------------------------------------------- +# Copyright 2015-2016 Nervana Systems Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- +# Modified to support pytorch Tensors + +import torch + +from decoders.decoder import Decoder + + +class GreedyDecoder(Decoder): + def __init__(self, labels, blank_index=0): + super(GreedyDecoder, self).__init__(labels, blank_index) + + def convert_to_strings(self, sequences, sizes=None, remove_repetitions=False, return_offsets=False): + """Given a list of numeric sequences, returns the corresponding strings""" + strings = [] + offsets = [] if return_offsets else None + for x in range(len(sequences)): + seq_len = sizes[x] if sizes is not None else len(sequences[x]) + string, string_offsets = self.process_string(sequences[x], seq_len, remove_repetitions) + strings.append([string]) # We only return one path + if return_offsets: + offsets.append([string_offsets]) + if return_offsets: + return strings, offsets + else: + return strings + + def process_string(self, sequence, size, remove_repetitions=False): + string = '' + offsets = [] + for i in range(size): + char = self.int_to_char[sequence[i].item()] + if char != self.int_to_char[self.blank_index]: + # if this char is a repetition and remove_repetitions=true, then skip + if remove_repetitions and i != 0 and char == self.int_to_char[sequence[i - 1].item()]: + pass + elif char == self.labels[self.space_index]: + string += ' ' + offsets.append(i) + else: + string = string + char + offsets.append(i) + return string, torch.tensor(offsets, dtype=torch.int) + + def decode(self, probs, sizes=None): + """ + Returns the argmax decoding given the probability matrix. Removes + repeated elements in the sequence, as well as blanks. + + Arguments: + probs: Tensor of character probabilities from the network. 
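The rule this class implements is standard CTC best-path decoding: take the argmax label per frame, merge consecutive repeats, then drop blanks. In miniature:

    labels = "_ab"             # index 0 is the CTC blank
    frames = [1, 1, 0, 2, 2]   # per-frame argmax, i.e. "aa_bb"

    decoded, prev = [], None
    for idx in frames:
        if idx != prev and idx != 0:  # merge repeats, skip blanks
            decoded.append(labels[idx])
        prev = idx
    print(''.join(decoded))  # prints "ab"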
Expected shape of batch x seq_length x output_dim + sizes(optional): Size of each sequence in the mini-batch + Returns: + strings: sequences of the model's best guess for the transcription on inputs + offsets: time step per character predicted + """ + _, max_probs = torch.max(probs, 2) + strings, offsets = self.convert_to_strings(max_probs.view(max_probs.size(0), max_probs.size(1)), sizes, + remove_repetitions=True, return_offsets=True) + return strings, offsets diff --git a/sonosco/infer.py b/sonosco/infer.py new file mode 100644 index 0000000..8fe03d2 --- /dev/null +++ b/sonosco/infer.py @@ -0,0 +1,24 @@ +import argparse +import os +import wave +from typing import Dict + +import yaml + +from modelwrapper import ModelWrapper + +parser = argparse.ArgumentParser(description='ASR inference') +parser.add_argument('--config', metavar='DIR', + help='Path to inference config file', default='config/infer.yaml') + +if __name__ == '__main__': + args = parser.parse_args() + with open(args.config, 'r') as file: + config = yaml.load(file) + config_dict: Dict = config["infer"] + model = ModelWrapper(**config_dict) + if "wave_path" in config_dict.keys() and os.path.isfile(config_dict.get("wave_path")): + sound = wave.open(config_dict.get("wave_path")) + print(model.infer(sound)) + else: + print("Wave file not found!") diff --git a/sonosco/loader.py b/sonosco/loader.py new file mode 100644 index 0000000..00b06d1 --- /dev/null +++ b/sonosco/loader.py @@ -0,0 +1,248 @@ +# ---------------------------------------------------------------------------- +# Based on SeanNaren's deepspeech.pytorch: +# https://github.com/SeanNaren/deepspeech.pytorch +# ---------------------------------------------------------------------------- + +import math +import warnings +from typing import Tuple + +import librosa +import numpy as np +import torch +import torchaudio +from scipy import signal +from torch.utils.data import Dataset, DataLoader, Sampler +from torch.distributed import get_rank +from torch.distributed import get_world_size + +windows = {"bartlett": torch.bartlett_window, + "blackman": torch.blackman_window, + "hamming": torch.hamming_window, + "hann": torch.hann_window} + +windows_legacy = {'hamming': signal.hamming, + 'hann': signal.hann, + 'blackman': signal.blackman, + 'bartlett': signal.bartlett} + + +class DataProcessor(object): + def __init__(self, audio_conf, labels="abc", normalize=False, augment=False, legacy=True): + """ + Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by + a comma. Each new line is a different sample. Example below: + + /path/to/audio.wav,/path/to/audio.txt + ... 
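The manifest referenced here is a plain two-column CSV with no header. A sketch for generating one from paired wav/txt files; the directory layout is illustrative only:

    import csv
    from pathlib import Path

    root = Path("data/librispeech")  # illustrative location
    with open("train_manifest.csv", "w", newline="") as f:
        writer = csv.writer(f)
        for wav in sorted(root.rglob("*.wav")):
            txt = wav.with_suffix(".txt")
            if txt.exists():  # only keep audio that has a transcript
                writer.writerow([str(wav), str(txt)])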
+ + :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds + :param labels: String containing all the possible characters to map to + :param normalize: Apply standard mean and deviation normalization to audio tensor + :param augment(default False): Apply random tempo and gain perturbations + """ + self.labels_map = dict([(labels[i], i) for i in range(len(labels))]) + self.window_stride = audio_conf["window_stride"] + self.window_size = audio_conf["window_size"] + self.sample_rate = audio_conf["sample_rate"] + self.window = windows_legacy.get(audio_conf["window"], windows_legacy["hamming"]) if legacy else windows.get(audio_conf["window"], windows["hamming"]) + self.normalize = normalize + self.augment = augment + self.legacy = legacy + self.transform = torchaudio.transforms.Spectrogram(n_fft=int(self.sample_rate * self.window_size), + hop=int(self.sample_rate * self.window_stride), + window=self.window, normalize=self.normalize) + + @staticmethod + def retrieve_file(audio_path, legacy=True): + sound, sample_rate = torchaudio.load(audio_path) + if legacy: + sound = sound.numpy().T + if len(sound.shape) > 1: + if sound.shape[1] == 1: + sound = sound.squeeze() + else: + sound = sound.mean(axis=1) + return sound, sample_rate + + @staticmethod + def augment_audio(sound, tempo_range: Tuple = (0.85, 1.15), gain_range: Tuple = (-6, 8)): + """ + Changes tempo and gain of the wave + """ + warnings.warn("Augmentation is not implemented") # TODO: Implement + return sound + + def parse_audio(self, audio_path): + sound, sample_rate = self.retrieve_file(audio_path, self.legacy) + if sample_rate != self.sample_rate: + raise ValueError(f"The stated sample rate {self.sample_rate} and the factual rate {sample_rate} differ!") + + if self.augment: + sound = self.augment_audio(sound) + + if self.legacy: + n_fft = int(self.sample_rate * self.window_size) + win_length = n_fft + hop_length = int(self.sample_rate * self.window_stride) + # STFT + D = librosa.stft(sound, n_fft=n_fft, hop_length=hop_length, + win_length=win_length, window=self.window) + spectrogram, phase = librosa.magphase(D) + # S = log(S+1) + + spectrogram = torch.FloatTensor(np.log1p(spectrogram)) + else: + # TODO: Why these are different from librosa.stft? + sound = sound.cuda() + spectrogram = self.transform(sound)[-1, :, :].transpose(0, 1) + + # spectrogram = torch.stft(torch.from_numpy(sound.numpy().T.squeeze()), + # n_fft=int(self.sample_rate * self.window_size), + # hop_length=int(self.sample_rate * self.window_stride), + # win_length=int(self.sample_rate * self.window_size), + # window=torch.hamming_window(int(self.sample_rate * self.window_size)))[:, :, -1] + + if self.normalize: + mean = spectrogram.mean() + std = spectrogram.std() + spectrogram.add_(-mean) + spectrogram.div_(std) + + return spectrogram + + def parse_transcript(self, transcript_path): + with open(transcript_path, 'r', encoding='utf8') as transcript_file: + transcript = transcript_file.read().replace('\n', '') + # TODO: Is it fast enough? + transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)])) + return transcript + + +class AudioDataset(Dataset): + def __init__(self, audio_conf, manifest_filepath, labels, normalize=False, augment=False, legacy=True): + """ + Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by + a comma. Each new line is a different sample. Example below: + + /path/to/audio.wav,/path/to/audio.txt + ... 
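A quick sanity check on the shapes parse_audio produces in the legacy librosa path: with 16 kHz audio and a 20 ms window, n_fft is 320, so the log-magnitude spectrogram has n_fft // 2 + 1 = 161 frequency bins and roughly one frame per 10 ms stride, which is exactly the rnn_in_size arithmetic the model below starts from:

    sample_rate, window_size, window_stride = 16000, 0.02, 0.01

    n_fft = int(sample_rate * window_size)   # 320
    hop = int(sample_rate * window_stride)   # 160
    freq_bins = n_fft // 2 + 1               # 161 spectrogram rows
    frames_per_second = sample_rate // hop   # ~100 columns per second of audio
    print(freq_bins, frames_per_second)      # 161 100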
+ + :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds + :param manifest_filepath: Path to manifest csv as describe above + :param labels: String containing all the possible characters to map to + :param normalize: Apply standard mean and deviation normalization to audio tensor + :param augment(default False): Apply random tempo and gain perturbations + """ + super(AudioDataset, self).__init__() + with open(manifest_filepath) as f: + ids = f.readlines() + ids = [x.strip().split(',') for x in ids] + self.ids = ids + self.size = len(ids) + self.processor = DataProcessor(audio_conf, labels, normalize, augment, legacy) + + def __getitem__(self, index): + sample = self.ids[index] + audio_path, transcript_path = sample[0], sample[1] + + spectrogram = self.processor.parse_audio(audio_path) + transcript = self.processor.parse_transcript(transcript_path) + + return spectrogram, transcript + + def __len__(self): + return self.size + + +# TODO: Optimise +def _collate_fn(batch): + batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True) + longest_sample = batch[0][0] + freq_size, max_seqlength = longest_sample.size() + minibatch_size = len(batch) + inputs = torch.zeros(minibatch_size, 1, freq_size, max_seqlength) + input_percentages = torch.FloatTensor(minibatch_size) + target_sizes = np.zeros(minibatch_size, dtype=np.int32) + + # TODO: Numpy broadcasting magic + targets = [] + + for x in range(minibatch_size): + inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) + input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) + target_sizes[x] = len(batch[x][1]) + targets.extend(batch[x][1]) + + return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) + + +class AudioDataLoader(DataLoader): + def __init__(self, *args, **kwargs): + """ + Creates a data loader for AudioDatasets. + """ + super(AudioDataLoader, self).__init__(*args, **kwargs) + self.collate_fn = _collate_fn + + +class BucketingSampler(Sampler): + def __init__(self, data_source, batch_size=1): + """ + Samples batches assuming they are in order of size to batch similarly sized samples together. + """ + super(BucketingSampler, self).__init__(data_source) + self.data_source = data_source + ids = list(range(0, len(data_source))) + # TODO: Optimise + self.bins = [ids[i:i + batch_size] for i in range(0, len(ids), batch_size)] + + def __iter__(self): + for ids in self.bins: + np.random.shuffle(ids) + yield ids + + def __len__(self): + return len(self.bins) + + def shuffle(self, epoch): + np.random.shuffle(self.bins) + + +# TODO: Optimise +class DistributedBucketingSampler(Sampler): + def __init__(self, data_source, batch_size=1, num_replicas=None, rank=None): + """ + Samples batches assuming they are in order of size to batch similarly sized samples together. 
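A subtlety in _collate_fn above: rather than storing absolute frame counts, it records each utterance's length as a fraction of the longest sample in the batch; the training loop later recovers integer sizes by multiplying back. The round trip looks like this:

    import torch

    max_seqlength = 200
    lengths = torch.tensor([200., 150., 80.])
    input_percentages = lengths / max_seqlength  # what _collate_fn stores

    # What the training loop does to recover frame counts (see modelwrapper below):
    input_sizes = input_percentages.mul_(max_seqlength).int()
    print(input_sizes.tolist())  # [200, 150, 80]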
+ """ + super(DistributedBucketingSampler, self).__init__(data_source) + if num_replicas is None: + num_replicas = get_world_size() + if rank is None: + rank = get_rank() + self.data_source = data_source + self.ids = list(range(0, len(data_source))) + self.batch_size = batch_size + self.bins = [self.ids[i:i + batch_size] for i in range(0, len(self.ids), batch_size)] + self.num_replicas = num_replicas + self.rank = rank + self.num_samples = int(math.ceil(len(self.bins) * 1.0 / self.num_replicas)) + self.total_size = self.num_samples * self.num_replicas + + def __iter__(self): + offset = self.rank + # add extra samples to make it evenly divisible + bins = self.bins + self.bins[:(self.total_size - len(self.bins))] + assert len(bins) == self.total_size + samples = bins[offset::self.num_replicas] # Get every Nth bin, starting from rank + return iter(samples) + + def __len__(self): + return self.num_samples + + def shuffle(self, epoch): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(epoch) + bin_ids = list(torch.randperm(len(self.bins), generator=g)) + self.bins = [self.bins[i] for i in bin_ids] diff --git a/sonosco/models/__init__.py b/sonosco/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sonosco/models/deepspeech2.py b/sonosco/models/deepspeech2.py new file mode 100644 index 0000000..dcfa100 --- /dev/null +++ b/sonosco/models/deepspeech2.py @@ -0,0 +1,269 @@ +# ---------------------------------------------------------------------------- +# Based on SeanNaren's deepspeech.pytorch: +# https://github.com/SeanNaren/deepspeech.pytorch +# ---------------------------------------------------------------------------- + +import math +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +supported_rnns = { + 'lstm': nn.LSTM, + 'rnn': nn.RNN, + 'gru': nn.GRU +} +supported_rnns_inv = dict((v, k) for k, v in supported_rnns.items()) + + +class SequenceWise(nn.Module): + def __init__(self, module): + """ + Collapses input of dim T*N*H to (T*N)*H, and applies to a module. + Allows handling of variable sequence lengths and minibatch sizes. + :param module: Module to apply input to. + """ + super(SequenceWise, self).__init__() + self.module = module + + def forward(self, x): + t, n = x.size(0), x.size(1) + x = x.view(t * n, -1) + x = self.module(x) + x = x.view(t, n, -1) + return x + + def __repr__(self): + tmpstr = self.__class__.__name__ + ' (\n' + tmpstr += self.module.__repr__() + tmpstr += ')' + return tmpstr + + +class MaskConv(nn.Module): + def __init__(self, seq_module): + """ + Adds padding to the output of the module based on the given lengths. This is to ensure that the + results of the model do not change when batch sizes change during inference. + Input needs to be in the shape of (BxCxDxT) + :param seq_module: The sequential module containing the conv stack. 
+ """ + super(MaskConv, self).__init__() + self.seq_module = seq_module + + def forward(self, x, lengths): + """ + :param x: The input of size BxCxDxT + :param lengths: The actual length of each sequence in the batch + :return: Masked output from the module + """ + for module in self.seq_module: + x = module(x) + mask = torch.ByteTensor(x.size()).fill_(0) + if x.is_cuda: + mask = mask.cuda() + for i, length in enumerate(lengths): + length = length.item() + if (mask[i].size(2) - length) > 0: + mask[i].narrow(2, length, mask[i].size(2) - length).fill_(1) + x = x.masked_fill(mask, 0) + return x, lengths + + +class InferenceBatchSoftmax(nn.Module): + def forward(self, input_): + if not self.training: + return F.softmax(input_, dim=-1) + else: + return input_ + + +class BatchRNN(nn.Module): + def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, batch_norm=True): + super(BatchRNN, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None + self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size, + bidirectional=True, bias=True) + + def flatten_parameters(self): + self.rnn.flatten_parameters() + + def forward(self, x, output_lengths): + if self.batch_norm is not None: + x = self.batch_norm(x) + x = nn.utils.rnn.pack_padded_sequence(x, output_lengths) + x, h = self.rnn(x) + x, _ = nn.utils.rnn.pad_packed_sequence(x) + if self.bidirectional: + x = x.view(x.size(0), x.size(1), 2, -1).sum(2).view(x.size(0), x.size(1), -1) # (TxNxH*2) -> (TxNxH) by sum + return x + + +class DeepSpeech2(nn.Module): + def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hid_size=768, nb_layers=5, audio_conf=None, + bidirectional=True): + super(DeepSpeech2, self).__init__() + + # model metadata needed for serialization/deserialization + if audio_conf is None: + audio_conf = {} + self.version = '0.0.1' + self.hidden_size = rnn_hid_size + self.hidden_layers = nb_layers + self.rnn_type = rnn_type + self.audio_conf = audio_conf or {} + self.labels = labels + self.bidirectional = bidirectional + # self.mixed_precision = mixed_precision + + sample_rate = self.audio_conf.get("sample_rate", 16000) + window_size = self.audio_conf.get("window_size", 0.02) + num_classes = len(self.labels) + + self.conv = MaskConv(nn.Sequential( + nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5)), + nn.BatchNorm2d(32), + nn.Hardtanh(0, 20, inplace=True), + nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5)), + nn.BatchNorm2d(32), + nn.Hardtanh(0, 20, inplace=True) + )) + # Based on above convolutions and spectrogram size using conv formula (W - F + 2P)/ S+1 + rnn_in_size = int(math.floor((sample_rate * window_size) / 2) + 1) + rnn_in_size = int(math.floor(rnn_in_size + 2 * 20 - 41) / 2 + 1) + rnn_in_size = int(math.floor(rnn_in_size + 2 * 10 - 21) / 2 + 1) + rnn_in_size *= 32 + + rnns = [('0', BatchRNN(input_size=rnn_in_size, hidden_size=rnn_hid_size, rnn_type=rnn_type, batch_norm=False))] + rnns.extend([(f"{x + 1}", BatchRNN(input_size=rnn_hid_size, hidden_size=rnn_hid_size, rnn_type=rnn_type)) + for x in range(nb_layers - 1)]) + self.rnns = nn.Sequential(OrderedDict(rnns)) + + fully_connected = nn.Sequential( + nn.BatchNorm1d(rnn_hid_size), + nn.Linear(rnn_hid_size, num_classes, bias=False) + ) + + self.fc = nn.Sequential( + SequenceWise(fully_connected), + ) + + self.inference_softmax = InferenceBatchSoftmax() + + def forward(self, x, lengths): + # if x.is_cuda and 
self.mixed_precision: + # x = x.half() + lengths = lengths.cpu().int() + output_lengths = self.get_seq_lens(lengths) + x, _ = self.conv(x, output_lengths) + + sizes = x.size() + x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3]) # Collapse feature dimension + x = x.transpose(1, 2).transpose(0, 1).contiguous() # TxNxH + + for rnn in self.rnns: + x = rnn(x, output_lengths) + + if not self.bidirectional: # no need for lookahead layer in bidirectional + x = self.lookahead(x) + + x = self.fc(x) + x = x.transpose(0, 1) + # identity in training mode, softmax in eval mode + x = self.inference_softmax(x) + return x, output_lengths + + def get_seq_lens(self, input_length): + """ + Given a 1D Tensor or Variable containing integer sequence lengths, return a 1D tensor or variable + containing the size sequences that will be output by the network. + :param input_length: 1D Tensor + :return: 1D Tensor scaled by model + """ + seq_len = input_length + for m in self.conv.modules(): + if type(m) == nn.modules.conv.Conv2d: + seq_len = ((seq_len + 2 * m.padding[1] - m.dilation[1] * (m.kernel_size[1] - 1) - 1) / m.stride[1] + 1) + return seq_len.int() + + @classmethod + def load_model(cls, path): + package = torch.load(path, map_location=lambda storage, loc: storage) + model = cls(rnn_hidden_size=package['hidden_size'], nb_layers=package['hidden_layers'], + labels=package['labels'], audio_conf=package['audio_conf'], + rnn_type=supported_rnns[package['rnn_type']], bidirectional=package.get('bidirectional', True)) + model.load_state_dict(package['state_dict']) + for x in model.rnns: + x.flatten_parameters() + + return model + + @classmethod + def load_model_package(cls, package): + model = cls(rnn_hidden_size=package['hidden_size'], nb_layers=package['hidden_layers'], + labels=package['labels'], audio_conf=package['audio_conf'], + rnn_type=supported_rnns[package['rnn_type']], bidirectional=package.get('bidirectional', True)) + model.load_state_dict(package['state_dict']) + + return model + + @staticmethod + def serialize(model, optimizer=None, epoch=None, iteration=None, loss_results=None, + cer_results=None, wer_results=None, avg_loss=None, meta=None): + package = { + 'version': model.version, + 'hidden_size': model.hidden_size, + 'hidden_layers': model.hidden_layers, + 'rnn_type': supported_rnns_inv.get(model.rnn_type, model.rnn_type.__name__.lower()), + 'audio_conf': model.audio_conf, + 'labels': model.labels, + 'state_dict': model.state_dict(), + 'bidirectional': model.bidirectional + } + if optimizer is not None: + package['optim_dict'] = optimizer.state_dict() + if avg_loss is not None: + package['avg_loss'] = avg_loss + if epoch is not None: + package['epoch'] = epoch + 1 # increment for readability + if iteration is not None: + package['iteration'] = iteration + if loss_results is not None: + package['loss_results'] = loss_results + package['cer_results'] = cer_results + package['wer_results'] = wer_results + if meta is not None: + package['meta'] = meta + return package + + @staticmethod + def get_param_size(model): + params = 0 + for p in model.parameters(): + tmp = 1 + for x in p.size(): + tmp *= x + params += tmp + return params + + def __repr__(self): + rep = f"DeepSpeech2 version: {self.version}\n" + \ + "=======================================" + \ + "Recurrent Neural Network Properties\n" + \ + f" RNN Type: \t{self.rnn_type.__name__.lower()}\n" + \ + f" RNN Layers:\t{self.hidden_layers}\n" + \ + f" RNN Size: \t{self.hidden_size}\n" + \ + f" Classes: \t{len(self.labels)}\n" + \ + 
"---------------------------------------\n" + \ + "Model Features\n" + \ + f" Labels: \t{self.labels}\n" + \ + f" Sample Rate: \t{self.audio_conf.get('sample_rate', 'n/a')}\n" + \ + f" Window Type: \t{self.audio_conf.get('window', 'n/a')}\n" + \ + f" Window Size: \t{self.audio_conf.get('window_size', 'n/a')}\n" + \ + f" Window Stride:\t{self.audio_conf.get('window_stride', 'n/a')}" + return rep diff --git a/sonosco/modelwrapper.py b/sonosco/modelwrapper.py new file mode 100644 index 0000000..b313603 --- /dev/null +++ b/sonosco/modelwrapper.py @@ -0,0 +1,413 @@ +import os.path +from random import random +from datetime import datetime + +import numpy as np +import torch + +from models.deepspeech2 import DeepSpeech2 + +import json +import os +import random +import time + +import torch.distributed as dist +import torch.utils.data.distributed + +try: + from apex.fp16_utils import FP16_Optimizer + from apex.parallel import DistributedDataParallel +except Exception as e: + print(f"Apex import failed: {e}") + +from tqdm import tqdm +from warpctc_pytorch import CTCLoss + +from loader import AudioDataLoader, AudioDataset, BucketingSampler, DistributedBucketingSampler +from decoders.greedy_decoder import GreedyDecoder +from utils import convert_model_to_half, reduce_tensor, check_loss + +models = {"deepspeech2": DeepSpeech2} + +sttime = datetime.now() +print(f"Time of start: {sttime}") + + +def to_np(x): + return x.cpu().numpy() + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +class ModelWrapper(object): + DEF_PATH = "examples/checkpoints/" + + def __init__(self, **kwargs): + self.model = kwargs.get("model", models["deepspeech2"]) + + if kwargs.get("continue"): + path = kwargs.get("from", ModelWrapper.get_default_path()) + self.model.package = torch.load(path, map_location=lambda storage, loc: storage) + self.model.load_model(path) + + self.save_path = kwargs.get("save", ModelWrapper.DEF_PATH + str(datetime.now().timestamp())) + + self.cuda = kwargs.get("cuda") + self.apex = kwargs.get("apex") if self.cuda else False + self.half = self.apex if self.apex else kwargs.get("half") + + def train(self, seed, cuda, mixed_precision, world_size, gpu_rank, rank, save_folder, dist_backend, dist_url, + epochs, continue_from, finetune, labels_path, sample_rate, window_size, window_stride, window, + hidden_size, hidden_layers, labels, supported_rnns, bidirectional, no_shuffle, no_sorta_grad, rnn_type, + train_manifest, augment, batch_size, num_workers, momentum, lr, static_loss_scale, dynamic_loss_scale, + val_manifest, max_norm, silent, checkpoint_per_batch, checkpoint, learning_anneal, model_path): + # Set seeds for determinism + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + np.random.seed(seed) + random.seed(seed) + + device = torch.device("cuda" if cuda else "cpu") + if mixed_precision and not cuda: + raise ValueError('If using mixed precision training, CUDA must be enabled!') + distributed = world_size > 1 + main_proc = True + device = torch.device("cuda" if cuda else "cpu") + if distributed: + if gpu_rank: + torch.cuda.set_device(int(gpu_rank)) + dist.init_process_group(backend=dist_backend, init_method=dist_url, + world_size=world_size, rank=rank) + main_proc = rank == 0 # Only the first proc should save 
models + save_folder = save_folder + os.makedirs(save_folder, exist_ok=True) # Ensure save folder exists + + loss_results, cer_results, wer_results = torch.Tensor(epochs), torch.Tensor(epochs), torch.Tensor(epochs) + best_wer = None + + avg_loss, start_epoch, start_iter, optim_state = 0, 0, 0, None + if continue_from: # Starting from previous model + print("Loading checkpoint model %s" % continue_from) + + labels = self.model.labels + audio_conf = self.model.audio_conf + if not finetune: # Don't want to restart training + optim_state = self.model.package['optim_dict'] + start_epoch = int(self.model.get('epoch', 1)) - 1 # Index start at 0 for training + start_iter = self.model.package.get('iteration', None) + if start_iter is None: + start_epoch += 1 # We saved model after epoch finished, start at the next epoch. + start_iter = 0 + else: + start_iter += 1 + avg_loss = int(self.model.package.get('avg_loss', 0)) + loss_results, cer_results, wer_results = self.model.package['loss_results'], \ + self.model.package['cer_results'], \ + self.model.package['wer_results'] + else: + with open(labels_path) as label_file: + labels = str(''.join(json.load(label_file))) + + audio_conf = dict(sample_rate=sample_rate, + window_size=window_size, + window_stride=window_stride, + window=window) + + rnn_type = rnn_type.lower() + assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru" + model = self.model(rnn_hidden_size=hidden_size, + nb_layers=hidden_layers, + labels=labels, + rnn_type=supported_rnns[rnn_type], + audio_conf=audio_conf, + bidirectional=bidirectional, + mixed_precision=mixed_precision) + + decoder = GreedyDecoder(labels) + + train_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=train_manifest, labels=labels, + normalize=False, augment=augment) + test_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=val_manifest, labels=labels, + normalize=False, augment=False) + if not distributed: + train_sampler = BucketingSampler(train_dataset, batch_size=batch_size) + else: + train_sampler = DistributedBucketingSampler(train_dataset, batch_size=batch_size, + num_replicas=world_size, rank=rank) + + train_loader = AudioDataLoader(train_dataset, num_workers=num_workers, batch_sampler=train_sampler) + test_loader = AudioDataLoader(test_dataset, batch_size=batch_size, num_workers=num_workers) + + if (not no_shuffle and start_epoch != 0) or no_sorta_grad: + print("Shuffling batches for the following epochs") + train_sampler.shuffle(start_epoch) + + model = model.to(device) + if mixed_precision: + model = convert_model_to_half(model) + parameters = model.parameters() + optimizer = torch.optim.SGD(parameters, lr=lr, + momentum=momentum, nesterov=True, weight_decay=1e-5) + if distributed: + model = DistributedDataParallel(model) + if mixed_precision: + optimizer = FP16_Optimizer(optimizer, + static_loss_scale=static_loss_scale, + dynamic_loss_scale=dynamic_loss_scale) + if optim_state is not None: + optimizer.load_state_dict(optim_state) + print(model) + print("Number of parameters: %d" % self.model.get_param_size(model)) + + criterion = CTCLoss() + batch_time = AverageMeter() + data_time = AverageMeter() + losses = AverageMeter() + + for epoch in range(start_epoch, epochs): + model.train() + end = time.time() + start_epoch_time = time.time() + for i, (data) in enumerate(train_loader, start=start_iter): + if i == len(train_sampler): + break + inputs, targets, input_percentages, target_sizes = data + input_sizes = input_percentages.mul_(int(inputs.size(3))).int() 
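The shape contract in this training loop: the model emits batch x time x classes, the output is transposed to time x batch x classes for warp-ctc, and the summed batch loss is divided by the minibatch size. A stand-in with the same shapes, using torch.nn.CTCLoss purely for illustration since warp-ctc requires a separate native build:

    import torch

    T, N, C = 100, 4, 29  # time steps, batch size, classes (blank = 0)
    log_probs = torch.randn(T, N, C).log_softmax(-1)
    targets = torch.randint(1, C, (N, 20), dtype=torch.long)
    output_sizes = torch.full((N,), T, dtype=torch.long)
    target_sizes = torch.full((N,), 20, dtype=torch.long)

    # reduction='sum' mirrors warp-ctc's summed batch loss before the division.
    ctc = torch.nn.CTCLoss(blank=0, reduction='sum')
    loss = ctc(log_probs, targets, output_sizes, target_sizes) / N
    print(loss.item())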
+ # measure data loading time + data_time.update(time.time() - end) + inputs = inputs.to(device) + + out, output_sizes = model(inputs, input_sizes) + out = out.transpose(0, 1) # TxNxH + + float_out = out.float() # ensure float32 for loss + loss = criterion(float_out, targets, output_sizes, target_sizes).to(device) + loss = loss / inputs.size(0) # average the loss by minibatch + + if distributed: + loss = loss.to(device) + loss_value = reduce_tensor(loss, world_size).item() + else: + loss_value = loss.item() + + # Check to ensure valid loss was calculated + valid_loss, error = check_loss(loss, loss_value) + if valid_loss: + optimizer.zero_grad() + # compute gradient + if mixed_precision: + optimizer.backward(loss) + optimizer.clip_master_grads(max_norm) + else: + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) + optimizer.step() + else: + print(error) + print('Skipping grad update') + loss_value = 0 + + avg_loss += loss_value + losses.update(loss_value, inputs.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + if not silent: + print('Epoch: [{0}][{1}/{2}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format( + (epoch + 1), (i + 1), len(train_sampler), batch_time=batch_time, data_time=data_time, + loss=losses)) + if checkpoint_per_batch > 0 and i > 0 and (i + 1) % checkpoint_per_batch == 0 and main_proc: + file_path = '%s/deepspeech_checkpoint_epoch_%d_iter_%d.pth' % (save_folder, epoch + 1, i + 1) + print("Saving checkpoint model to %s" % file_path) + torch.save(self.model.serialize(model, optimizer=optimizer, epoch=epoch, iteration=i, + loss_results=loss_results, + wer_results=wer_results, cer_results=cer_results, + avg_loss=avg_loss), + file_path) + del loss, out, float_out + + avg_loss /= len(train_sampler) + + epoch_time = time.time() - start_epoch_time + print(f"Elapsed time from start: {datetime.now() - sttime}") + print('Training Summary Epoch: [{0}]\t' + 'Time taken (s): {epoch_time:.0f}\t' + 'Average Loss {loss:.3f}\t'.format(epoch + 1, epoch_time=epoch_time, loss=avg_loss)) + + start_iter = 0 # Reset start iteration for next epoch + total_cer, total_wer = 0, 0 + model.eval() + with torch.no_grad(): + for i, (data) in tqdm(enumerate(test_loader), total=len(test_loader)): + inputs, targets, input_percentages, target_sizes = data + input_sizes = input_percentages.mul_(int(inputs.size(3))).int() + inputs = inputs.to(device) + + # unflatten targets + split_targets = [] + offset = 0 + for size in target_sizes: + split_targets.append(targets[offset:offset + size]) + offset += size + + out, output_sizes = model(inputs, input_sizes) + + decoded_output, _ = decoder.decode(out, output_sizes) + target_strings = decoder.convert_to_strings(split_targets) + wer, cer = 0, 0 + for x in range(len(target_strings)): + transcript, reference = decoded_output[x][0], target_strings[x][0] + wer += decoder.wer(transcript, reference) / float(len(reference.split())) + cer += decoder.cer(transcript, reference) / float(len(reference)) + total_cer += cer + total_wer += wer + del out + wer = total_wer / len(test_loader.dataset) + cer = total_cer / len(test_loader.dataset) + wer *= 100 + cer *= 100 + loss_results[epoch] = avg_loss + wer_results[epoch] = wer + cer_results[epoch] = cer + print('Validation Summary Epoch: [{0}]\t' + 'Average WER {wer:.3f}\t' + 'Average CER {cer:.3f}\t'.format( + epoch + 1, wer=wer, cer=cer)) + + values 
= { + 'loss_results': loss_results, + 'cer_results': cer_results, + 'wer_results': wer_results + } + + if main_proc and checkpoint: + file_path = '%s/deepspeech_%d.pth.tar' % (save_folder, epoch + 1) + torch.save(self.model.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results, + wer_results=wer_results, cer_results=cer_results), + file_path) + # anneal lr + param_groups = optimizer.optimizer.param_groups if mixed_precision else optimizer.param_groups + for g in param_groups: + g['lr'] = g['lr'] / learning_anneal + print('Learning rate annealed to: {lr:.6f}'.format(lr=g['lr'])) + + if main_proc and (best_wer is None or best_wer > wer): + print("Found better validated model, saving to %s" % model_path) + torch.save(self.model.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results, + wer_results=wer_results, cer_results=cer_results), model_path) + best_wer = wer + + avg_loss = 0 + if not no_shuffle: + print("Shuffling batches...") + train_sampler.shuffle(epoch) + + def validate(self): + pass + + def test(self): + torch.set_grad_enabled(False) + device = torch.device("cuda" if cuda else "cpu") + model = load_model(device, model_path, cuda) + + if decoder == "beam": + from decoder import BeamCTCDecoder + + decoder = BeamCTCDecoder(model.labels, lm_path=lm_path, alpha=alpha, beta=beta, + cutoff_top_n=cutoff_top_n, cutoff_prob=cutoff_prob, + beam_width=beam_width, num_processes=lm_workers) + elif decoder == "greedy": + decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_')) + else: + decoder = None + target_decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_')) + test_dataset = AudioDataset(audio_conf=model.audio_conf, manifest_filepath=test_manifest, + labels=model.labels, normalize=True) + test_loader = AudioDataLoader(test_dataset, batch_size=batch_size, + num_workers=num_workers) + total_cer, total_wer, num_tokens, num_chars = 0, 0, 0, 0 + output_data = [] + for i, (data) in tqdm(enumerate(test_loader), total=len(test_loader)): + inputs, targets, input_percentages, target_sizes = data + input_sizes = input_percentages.mul_(int(inputs.size(3))).int() + inputs = inputs.to(device) + # unflatten targets + split_targets = [] + offset = 0 + for size in target_sizes: + split_targets.append(targets[offset:offset + size]) + offset += size + + out, output_sizes = model(inputs, input_sizes) + + if save_output: + # add output to data array, and continue + output_data.append((out.cpu().numpy(), output_sizes.numpy())) + + decoded_output, _ = decoder.decode(out, output_sizes) + target_strings = target_decoder.convert_to_strings(split_targets) + for x in range(len(target_strings)): + transcript, reference = decoded_output[x][0], target_strings[x][0] + wer_inst = decoder.wer(transcript, reference) + cer_inst = decoder.cer(transcript, reference) + total_wer += wer_inst + total_cer += cer_inst + num_tokens += len(reference.split()) + num_chars += len(reference) + if verbose: + print("Ref:", reference.lower()) + print("Hyp:", transcript.lower()) + print("WER:", float(wer_inst) / len(reference.split()), "CER:", float(cer_inst) / len(reference), + "\n") + + wer = float(total_wer) / num_tokens + cer = float(total_cer) / num_chars + + print('Test Summary \t' + 'Average WER {wer:.3f}\t' + 'Average CER {cer:.3f}\t'.format(wer=wer * 100, cer=cer * 100)) + if save_output: + np.save(output_path, output_data) + + def infer(self, sound): + pass + + @staticmethod + def get_default_path(def_path: str) -> str: + """ + Returns the path to the 
latest checkpoint in the default location
+        :param def_path: default path where checkpoints are stored
+        :return: the path to the latest checkpoint
+        """
+        latest_subdir = max([os.path.join(def_path, d) for d in os.listdir(def_path)], key=os.path.getmtime)
+        default = latest_subdir + "/final.pth"
+        return default
+
+    def print_training_info(self, epoch, loss, cer, wer):
+        print(f"\nTraining Information\n " + \
+              f"- Epoch:\t{epoch}\n " + \
+              f"- Current Loss:\t{loss}\n " + \
+              f"- Current CER: \t{cer}\n " + \
+              f"- Current WER: \t{wer}")
diff --git a/sonosco/test.py b/sonosco/test.py
new file mode 100644
index 0000000..f4d8a39
--- /dev/null
+++ b/sonosco/test.py
@@ -0,0 +1,18 @@
+import argparse
+from typing import Dict
+
+import yaml
+
+from modelwrapper import ModelWrapper
+
+parser = argparse.ArgumentParser(description='ASR testing')
+parser.add_argument('--config', metavar='DIR',
+                    help='Path to test config file', default='config/test.yaml')
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    with open(args.config, 'r') as file:
+        config = yaml.load(file)
+    config_dict: Dict = config["test"]
+    model = ModelWrapper(**config_dict)
+    model.test()
diff --git a/sonosco/train.py b/sonosco/train.py
new file mode 100644
index 0000000..a763a47
--- /dev/null
+++ b/sonosco/train.py
@@ -0,0 +1,18 @@
+import argparse
+from typing import Dict
+
+import yaml
+
+from modelwrapper import ModelWrapper
+
+parser = argparse.ArgumentParser(description='ASR training')
+parser.add_argument('--config', metavar='DIR',
+                    help='Path to train config file', default='config/train.yaml')
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    with open(args.config, 'r') as file:
+        config = yaml.load(file)
+    config_dict: Dict = config["train"]
+    model = ModelWrapper(**config_dict)
+    model.train()
diff --git a/sonosco/utils.py b/sonosco/utils.py
new file mode 100644
index 0000000..5e219d1
--- /dev/null
+++ b/sonosco/utils.py
@@ -0,0 +1,51 @@
+import torch
+from apex.fp16_utils import BN_convert_float
+import torch.distributed as dist
+
+from models.deepspeech2 import DeepSpeech2
+
+
+def convert_model_to_half(model):
+    """
+    Converts model to half but keeps the batch norm layers in 32 bit for precision purposes
+    """
+    old_model = model
+    new_model = BN_convert_float(model.half())
+    del old_model  # Delete previous non-half model
+    return new_model
+
+
+def reduce_tensor(tensor, world_size, reduce_op_max=False):
+    rt = tensor.clone()
+    dist.all_reduce(rt, op=dist.reduce_op.MAX if reduce_op_max is True else dist.reduce_op.SUM)  # Default to sum
+    if not reduce_op_max:
+        rt /= world_size
+    return rt
+
+
+def check_loss(loss, loss_value):
+    """
+    Check that warp-ctc loss is valid and will not break training
+    :return: Return if loss is valid, and the error in case it is not
+    """
+    loss_valid = True
+    error = ''
+    if loss_value == float("inf") or loss_value == float("-inf"):
+        loss_valid = False
+        error = "WARNING: received an inf loss"
+    elif torch.isnan(loss).sum() > 0:
+        loss_valid = False
+        error = 'WARNING: received a nan loss, setting loss value to 0'
+    elif loss_value < 0:
+        loss_valid = False
+        error = "WARNING: received a negative loss"
+    return loss_valid, error
+
+
+def load_model(device, model_path, is_cuda):
+    model = DeepSpeech2.load_model(model_path)
+    model.eval()
+    model = model.to(device)
+    if is_cuda and model.mixed_precision:
+        model = convert_model_to_half(model)
+    return model
From ab254f587c2481503eed6e0dd25df6df012f997c Mon Sep 17 00:00:00 2001
From: ga38nif
Date: Sat, 25 May 2019
17:32:20 +0200 Subject: [PATCH 04/58] init datasets folder, remove legacy code, split loader in multiple files --- sonosco/datasets/AudioDataLoader.py | 36 +++++ sonosco/datasets/AudioDataSampler.py | 68 +++++++++ sonosco/datasets/AudioDataset.py | 137 ++++++++++++++++++ sonosco/datasets/__init__.py | 0 .../datasets/download_datasets/__init__.py | 0 sonosco/datasets/test_datasets.py | 25 ++++ 6 files changed, 266 insertions(+) create mode 100644 sonosco/datasets/AudioDataLoader.py create mode 100644 sonosco/datasets/AudioDataSampler.py create mode 100644 sonosco/datasets/AudioDataset.py create mode 100644 sonosco/datasets/__init__.py create mode 100644 sonosco/datasets/download_datasets/__init__.py create mode 100644 sonosco/datasets/test_datasets.py diff --git a/sonosco/datasets/AudioDataLoader.py b/sonosco/datasets/AudioDataLoader.py new file mode 100644 index 0000000..6c94f2c --- /dev/null +++ b/sonosco/datasets/AudioDataLoader.py @@ -0,0 +1,36 @@ + +import numpy as np +import torch + +from torch.utils.data import Dataset, DataLoader, Sampler + + +# TODO: Optimise +def _collate_fn(batch): + batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True) + longest_sample = batch[0][0] + freq_size, max_seqlength = longest_sample.size() + minibatch_size = len(batch) + inputs = torch.zeros(minibatch_size, 1, freq_size, max_seqlength) + input_percentages = torch.FloatTensor(minibatch_size) + target_sizes = np.zeros(minibatch_size, dtype=np.int32) + + # TODO: Numpy broadcasting magic + targets = [] + + for x in range(minibatch_size): + inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) + input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) + target_sizes[x] = len(batch[x][1]) + targets.extend(batch[x][1]) + + return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) + + +class AudioDataLoader(DataLoader): + def __init__(self, *args, **kwargs): + """ + Creates a data loader for AudioDatasets. + """ + super(AudioDataLoader, self).__init__(*args, **kwargs) + self.collate_fn = _collate_fn \ No newline at end of file diff --git a/sonosco/datasets/AudioDataSampler.py b/sonosco/datasets/AudioDataSampler.py new file mode 100644 index 0000000..100bde7 --- /dev/null +++ b/sonosco/datasets/AudioDataSampler.py @@ -0,0 +1,68 @@ +import math + +import numpy as np +import torch +from torch.utils.data import Sampler +from torch.distributed.deprecated import get_rank +from torch.distributed.deprecated import get_world_size + +class BucketingSampler(Sampler): + def __init__(self, data_source, batch_size=1): + """ + Samples batches assuming they are in order of size to batch similarly sized samples together. + """ + super(BucketingSampler, self).__init__(data_source) + self.data_source = data_source + ids = list(range(0, len(data_source))) + # TODO: Optimise + self.bins = [ids[i:i + batch_size] for i in range(0, len(ids), batch_size)] + + def __iter__(self): + for ids in self.bins: + np.random.shuffle(ids) + yield ids + + def __len__(self): + return len(self.bins) + + def shuffle(self, epoch): + np.random.shuffle(self.bins) + + +# TODO: Optimise +class DistributedBucketingSampler(Sampler): + def __init__(self, data_source, batch_size=1, num_replicas=None, rank=None): + """ + Samples batches assuming they are in order of size to batch similarly sized samples together. 
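Note that this sampler imports get_rank and get_world_size from torch.distributed.deprecated, whereas the loader.py it replaces used the current torch.distributed module. Both exist under the pinned torch 1.1, but the non-deprecated form is the safer long-term choice:

    # The current API, matching what loader.py used before this refactor:
    from torch.distributed import get_rank, get_world_size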
+ """ + super(DistributedBucketingSampler, self).__init__(data_source) + if num_replicas is None: + num_replicas = get_world_size() + if rank is None: + rank = get_rank() + self.data_source = data_source + self.ids = list(range(0, len(data_source))) + self.batch_size = batch_size + self.bins = [self.ids[i:i + batch_size] for i in range(0, len(self.ids), batch_size)] + self.num_replicas = num_replicas + self.rank = rank + self.num_samples = int(math.ceil(len(self.bins) * 1.0 / self.num_replicas)) + self.total_size = self.num_samples * self.num_replicas + + def __iter__(self): + offset = self.rank + # add extra samples to make it evenly divisible + bins = self.bins + self.bins[:(self.total_size - len(self.bins))] + assert len(bins) == self.total_size + samples = bins[offset::self.num_replicas] # Get every Nth bin, starting from rank + return iter(samples) + + def __len__(self): + return self.num_samples + + def shuffle(self, epoch): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(epoch) + bin_ids = list(torch.randperm(len(self.bins), generator=g)) + self.bins = [self.bins[i] for i in bin_ids] \ No newline at end of file diff --git a/sonosco/datasets/AudioDataset.py b/sonosco/datasets/AudioDataset.py new file mode 100644 index 0000000..7639723 --- /dev/null +++ b/sonosco/datasets/AudioDataset.py @@ -0,0 +1,137 @@ +# ---------------------------------------------------------------------------- +# Based on SeanNaren's deepspeech.pytorch: +# https://github.com/SeanNaren/deepspeech.pytorch +# ---------------------------------------------------------------------------- + +import warnings +from typing import Tuple + +import torch +import torchaudio +from scipy import signal +from torch.utils.data import Dataset + +windows = {"bartlett": torch.bartlett_window, + "blackman": torch.blackman_window, + "hamming": torch.hamming_window, + "hann": torch.hann_window} + + +class DataProcessor(object): + def __init__(self, audio_conf, labels="abc", normalize=False, augment=False): + """ + Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by + a comma. Each new line is a different sample. Example below: + /path/to/audio.wav,/path/to/audio.txt + ... 
+ :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds + :param labels: String containing all the possible characters to map to + :param normalize: Apply standard mean and deviation normalization to audio tensor + :param augment(default False): Apply random tempo and gain perturbations + """ + self.labels_map = dict([(labels[i], i) for i in range(len(labels))]) + self.window_stride = audio_conf["window_stride"] + self.window_size = audio_conf["window_size"] + self.sample_rate = audio_conf["sample_rate"] + self.window = windows.get(audio_conf["window"], windows["hamming"]) + self.normalize = normalize + self.augment = augment + + @staticmethod + def retrieve_file(audio_path): + sound, sample_rate = torchaudio.load(audio_path) + return sound, sample_rate + + @staticmethod + def augment_audio(sound, tempo_range: Tuple = (0.85, 1.15), gain_range: Tuple = (-6, 8)): + """ + Changes tempo and gain of the wave + """ + warnings.warn("Augmentation is not implemented") # TODO: Implement + return sound + + def parse_audio(self, audio_path): + sound, sample_rate = self.retrieve_file(audio_path) + if sample_rate != self.sample_rate: + raise ValueError(f"The stated sample rate {self.sample_rate} and the factual rate {sample_rate} differ!") + + if self.augment: + sound = self.augment_audio(sound) + + #sound = sound.cuda() + spectrogram = torch.stft(torch.from_numpy(sound.numpy().T.squeeze()), + n_fft=int(self.sample_rate * self.window_size), + hop_length=int(self.sample_rate * self.window_stride), + win_length=int(self.sample_rate * self.window_size), + window=torch.hamming_window(int(self.sample_rate * self.window_size)))[:, :, -1] + + + + if self.normalize: + mean = spectrogram.mean() + std = spectrogram.std() + spectrogram.add_(-mean) + spectrogram.div_(std) + + return spectrogram + + def parse_transcript(self, transcript_path): + with open(transcript_path, 'r', encoding='utf8') as transcript_file: + transcript = transcript_file.read().replace('\n', '') + # TODO: Is it fast enough? + transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)])) + return transcript + + +class AudioDataset(Dataset): + def __init__(self, audio_conf, manifest_filepath, labels, normalize=False, augment=False): + """ + Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by + a comma. Each new line is a different sample. Example below: + /path/to/audio.wav,/path/to/audio.txt + ... 
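One thing to flag in parse_audio above: under torch 1.1, torch.stft returns a real tensor of shape (freq, time, 2) holding the real and imaginary parts, so the trailing [:, :, -1] keeps only the imaginary component, not the magnitude that the legacy librosa path computed. A sketch of the magnitude variant for comparison:

    import torch

    sound = torch.randn(16000)  # one second of dummy 16 kHz audio
    n_fft, hop = 320, 160
    stft = torch.stft(sound, n_fft=n_fft, hop_length=hop, win_length=n_fft,
                      window=torch.hamming_window(n_fft))  # (freq, time, 2) under torch 1.1
    magnitude = stft.pow(2).sum(-1).sqrt()  # |STFT|: combine real and imaginary parts
    log_spec = torch.log1p(magnitude)       # matches the legacy log(S + 1) scaling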
+ :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds + :param manifest_filepath: Path to manifest csv as describe above + :param labels: String containing all the possible characters to map to + :param normalize: Apply standard mean and deviation normalization to audio tensor + :param augment(default False): Apply random tempo and gain perturbations + """ + super(AudioDataset, self).__init__() + with open(manifest_filepath) as f: + ids = f.readlines() + ids = [x.strip().split(',') for x in ids] + self.ids = ids + self.size = len(ids) + self.processor = DataProcessor(audio_conf, labels, normalize, augment) + + def __getitem__(self, index): + sample = self.ids[index] + audio_path, transcript_path = sample[0], sample[1] + + spectrogram = self.processor.parse_audio(audio_path) + transcript = self.processor.parse_transcript(transcript_path) + + return spectrogram, transcript + + def __len__(self): + return self.size + +def main(): + audio_conf = dict(sample_rate=16000, + window_size=.02, + window_stride=.01, + window='hamming') + test_manifest = '/Users/florianlay/data/libri_test_clean_manifest.csv' + labels = 'abc' + test_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=test_manifest, labels=labels, + normalize=False, augment=False) + print("Dataset is created\n====================\n") + + test = test_dataset[0] + batch_size = 16 + sampler = BucketingSampler(test_dataset, batch_size=batch_size) + dataloader = DataLoader(dataset=test_dataset, num_workers=4, collate_fn=_collate_fn, batch_sampler=sampler) + test_dataset[0] + #inputs, targets, input_percentages, target_sizes = next(iter(dataloader)) +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/sonosco/datasets/__init__.py b/sonosco/datasets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sonosco/datasets/download_datasets/__init__.py b/sonosco/datasets/download_datasets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sonosco/datasets/test_datasets.py b/sonosco/datasets/test_datasets.py new file mode 100644 index 0000000..bc02b8d --- /dev/null +++ b/sonosco/datasets/test_datasets.py @@ -0,0 +1,25 @@ +from AudioDataLoader import AudioDataLoader +from AudioDataSampler import BucketingSampler, DistributedBucketingSampler +from AudioDataset import AudioDataset + + +def main(): + audio_conf = dict(sample_rate=16000, + window_size=.02, + window_stride=.01, + window='hamming') + test_manifest = '/Users/florianlay/data/libri_test_clean_manifest.csv' + labels = 'abc' + test_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=test_manifest, labels=labels, + normalize=False, augment=False) + print("Dataset is created\n====================\n") + + test = test_dataset[0] + batch_size = 16 + sampler = BucketingSampler(test_dataset, batch_size=batch_size) + dataloader = AudioDataLoader(test_dataset,num_workers=4, batch_sampler=sampler) + + inputs, targets, input_percentages, target_sizes = next(iter(dataloader)) + print(targets) +if __name__ == "__main__": + main() \ No newline at end of file From dcaa86c03c6a00c165f82eae8d900aa63b438beb Mon Sep 17 00:00:00 2001 From: ga38nif Date: Sat, 25 May 2019 17:58:06 +0200 Subject: [PATCH 05/58] rename testscript --- sonosco/datasets/AudioDataset.py | 4 +++- .../datasets/{test_datasets.py => datasets_test_script.py} | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) rename sonosco/datasets/{test_datasets.py => datasets_test_script.py} (83%) diff --git 
a/sonosco/datasets/AudioDataset.py b/sonosco/datasets/AudioDataset.py index 7639723..0a95284 100644 --- a/sonosco/datasets/AudioDataset.py +++ b/sonosco/datasets/AudioDataset.py @@ -78,8 +78,10 @@ def parse_audio(self, audio_path): def parse_transcript(self, transcript_path): with open(transcript_path, 'r', encoding='utf8') as transcript_file: transcript = transcript_file.read().replace('\n', '') + print(f"1: {transcript}") # TODO: Is it fast enough? transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)])) + print(f"transcript_path: {transcript_path}\ntranscript: {transcript}") return transcript @@ -122,7 +124,7 @@ def main(): window_stride=.01, window='hamming') test_manifest = '/Users/florianlay/data/libri_test_clean_manifest.csv' - labels = 'abc' + labels = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' test_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=test_manifest, labels=labels, normalize=False, augment=False) print("Dataset is created\n====================\n") diff --git a/sonosco/datasets/test_datasets.py b/sonosco/datasets/datasets_test_script.py similarity index 83% rename from sonosco/datasets/test_datasets.py rename to sonosco/datasets/datasets_test_script.py index bc02b8d..7f8a11f 100644 --- a/sonosco/datasets/test_datasets.py +++ b/sonosco/datasets/datasets_test_script.py @@ -4,6 +4,10 @@ def main(): + labels_path = "/Users/florianlay/roboy/sonosco/sonosco/datasets/labels.json" + with open(labels_path) as label_file: + labels = str(''.join(json.load(label_file))) + audio_conf = dict(sample_rate=16000, window_size=.02, window_stride=.01, @@ -20,6 +24,6 @@ def main(): dataloader = AudioDataLoader(test_dataset,num_workers=4, batch_sampler=sampler) inputs, targets, input_percentages, target_sizes = next(iter(dataloader)) - print(targets) + print(test) if __name__ == "__main__": main() \ No newline at end of file From 02eb538f8befc825c6226dd857d91c18a1fbbc3b Mon Sep 17 00:00:00 2001 From: ga38nif Date: Sun, 26 May 2019 23:38:53 +0200 Subject: [PATCH 06/58] move collate fn in class --- sonosco/datasets/AudioDataLoader.py | 46 ++++++++++++++--------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/sonosco/datasets/AudioDataLoader.py b/sonosco/datasets/AudioDataLoader.py index 6c94f2c..3161abd 100644 --- a/sonosco/datasets/AudioDataLoader.py +++ b/sonosco/datasets/AudioDataLoader.py @@ -4,33 +4,31 @@ from torch.utils.data import Dataset, DataLoader, Sampler - -# TODO: Optimise -def _collate_fn(batch): - batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True) - longest_sample = batch[0][0] - freq_size, max_seqlength = longest_sample.size() - minibatch_size = len(batch) - inputs = torch.zeros(minibatch_size, 1, freq_size, max_seqlength) - input_percentages = torch.FloatTensor(minibatch_size) - target_sizes = np.zeros(minibatch_size, dtype=np.int32) - - # TODO: Numpy broadcasting magic - targets = [] - - for x in range(minibatch_size): - inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) - input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) - target_sizes[x] = len(batch[x][1]) - targets.extend(batch[x][1]) - - return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) - - class AudioDataLoader(DataLoader): def __init__(self, *args, **kwargs): """ Creates a data loader for AudioDatasets. 
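
Worked example of what the collate step produces (shapes only, assuming a batch of three spectrograms with 161 frequency bins and 120/80/50 frames):

    inputs.shape       # (3, 1, 161, 120): zero-padded to the longest clip, sorted longest first
    input_percentages  # (1.0, 80/120, 50/120): true length as a fraction of the padded width
    targets            # 1-D IntTensor with all transcript label ids concatenated
    target_sizes       # (len_0, len_1, len_2): needed to split `targets` back per sample, e.g. for CTC
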
""" super(AudioDataLoader, self).__init__(*args, **kwargs) - self.collate_fn = _collate_fn \ No newline at end of file + self.collate_fn = self._collate_fn + +# TODO: Optimise + def _collate_fn(batch): + batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True) + longest_sample = batch[0][0] + freq_size, max_seqlength = longest_sample.size() + minibatch_size = len(batch) + inputs = torch.zeros(minibatch_size, 1, freq_size, max_seqlength) + input_percentages = torch.FloatTensor(minibatch_size) + target_sizes = np.zeros(minibatch_size, dtype=np.int32) + + # TODO: Numpy broadcasting magic + targets = [] + + for x in range(minibatch_size): + inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) + input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) + target_sizes[x] = len(batch[x][1]) + targets.extend(batch[x][1]) + + return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) \ No newline at end of file From 08c7cdaac9d5e2837c976ac21012018ffe25861c Mon Sep 17 00:00:00 2001 From: ga38nif Date: Sun, 26 May 2019 23:51:39 +0200 Subject: [PATCH 07/58] add scripts from deepspeech to download datasets --- sonosco/datasets/download_datasets/an4.py | 87 +++++++++++++ .../download_datasets/common_voice.py | 90 +++++++++++++ .../datasets/download_datasets/librispeech.py | 113 ++++++++++++++++ sonosco/datasets/download_datasets/ted3.py | 123 ++++++++++++++++++ .../datasets/download_datasets/voxforge.py | 102 +++++++++++++++ 5 files changed, 515 insertions(+) create mode 100644 sonosco/datasets/download_datasets/an4.py create mode 100644 sonosco/datasets/download_datasets/common_voice.py create mode 100644 sonosco/datasets/download_datasets/librispeech.py create mode 100644 sonosco/datasets/download_datasets/ted3.py create mode 100644 sonosco/datasets/download_datasets/voxforge.py diff --git a/sonosco/datasets/download_datasets/an4.py b/sonosco/datasets/download_datasets/an4.py new file mode 100644 index 0000000..f810ee0 --- /dev/null +++ b/sonosco/datasets/download_datasets/an4.py @@ -0,0 +1,87 @@ +import argparse +import os +import io +import shutil +import tarfile +import wget + +from utils import create_manifest + +parser = argparse.ArgumentParser(description='Processes and downloads an4.') +parser.add_argument('--target-dir', default='an4_dataset/', help='Path to save dataset') +parser.add_argument('--min-duration', default=1, type=int, + help='Prunes training samples shorter than the min duration (given in seconds, default 1)') +parser.add_argument('--max-duration', default=15, type=int, + help='Prunes training samples longer than the max duration (given in seconds, default 15)') +args = parser.parse_args() + + +def _format_data(root_path, data_tag, name, wav_folder): + data_path = args.target_dir + data_tag + '/' + name + '/' + new_transcript_path = data_path + '/txt/' + new_wav_path = data_path + '/wav/' + + os.makedirs(new_transcript_path) + os.makedirs(new_wav_path) + + wav_path = root_path + 'wav/' + file_ids = root_path + 'etc/an4_%s.fileids' % data_tag + transcripts = root_path + 'etc/an4_%s.transcription' % data_tag + train_path = wav_path + wav_folder + + _convert_audio_to_wav(train_path) + _format_files(file_ids, new_transcript_path, new_wav_path, transcripts, wav_path) + + +def _convert_audio_to_wav(train_path): + with os.popen('find %s -type f -name "*.raw"' % train_path) as pipe: + for line in pipe: + raw_path = line.strip() + new_path = line.replace('.raw', '.wav').strip() + cmd = 'sox -t raw -r %d -b 16 -e 
signed-integer -B -c 1 \"%s\" \"%s\"' % ( + 16000, raw_path, new_path) + os.system(cmd) + + +def _format_files(file_ids, new_transcript_path, new_wav_path, transcripts, wav_path): + with open(file_ids, 'r') as f: + with open(transcripts, 'r') as t: + paths = f.readlines() + transcripts = t.readlines() + for x in range(len(paths)): + path = wav_path + paths[x].strip() + '.wav' + filename = path.split('/')[-1] + extracted_transcript = _process_transcript(transcripts, x) + current_path = os.path.abspath(path) + new_path = new_wav_path + filename + text_path = new_transcript_path + filename.replace('.wav', '.txt') + with io.FileIO(text_path, "w") as file: + file.write(extracted_transcript.encode('utf-8')) + os.rename(current_path, new_path) + + +def _process_transcript(transcripts, x): + extracted_transcript = transcripts[x].split('(')[0].strip("").split('<')[0].strip().upper() + return extracted_transcript + + +def main(): + root_path = 'an4/' + name = 'an4' + wget.download('http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz') + tar = tarfile.open('an4_raw.bigendian.tar.gz') + tar.extractall() + os.makedirs(args.target_dir) + _format_data(root_path, 'train', name, 'an4_clstk') + _format_data(root_path, 'test', name, 'an4test_clstk') + shutil.rmtree(root_path) + os.remove('an4_raw.bigendian.tar.gz') + train_path = args.target_dir + '/train/' + test_path = args.target_dir + '/test/' + print ('\n', 'Creating manifests...') + create_manifest(train_path, 'an4_train_manifest.csv', args.min_duration, args.max_duration) + create_manifest(test_path, 'an4_val_manifest.csv') + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/sonosco/datasets/download_datasets/common_voice.py b/sonosco/datasets/download_datasets/common_voice.py new file mode 100644 index 0000000..fbc7b91 --- /dev/null +++ b/sonosco/datasets/download_datasets/common_voice.py @@ -0,0 +1,90 @@ +import os +import wget +import tarfile +import argparse +import csv +from multiprocessing.pool import ThreadPool +import subprocess +from utils import create_manifest + +parser = argparse.ArgumentParser(description='Downloads and processes Mozilla Common Voice dataset.') +parser.add_argument("--target-dir", default='CommonVoice_dataset/', type=str, help="Directory to store the dataset.") +parser.add_argument("--tar-path", type=str, help="Path to the Common Voice *.tar file if downloaded (Optional).") +parser.add_argument('--sample-rate', default=16000, type=int, help='Sample rate') +parser.add_argument('--min-duration', default=1, type=int, + help='Prunes training samples shorter than the min duration (given in seconds, default 1)') +parser.add_argument('--max-duration', default=15, type=int, + help='Prunes training samples longer than the max duration (given in seconds, default 15)') +parser.add_argument('--files-to-process', default="cv-valid-dev.csv,cv-valid-test.csv,cv-valid-train.csv", + type=str, help='list of *.csv file names to process') +args = parser.parse_args() +COMMON_VOICE_URL = "https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz" + + +def convert_to_wav(csv_file, target_dir): + """ Read *.csv file description, convert mp3 to wav, process text. + Save results to target_dir. 
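
With the defaults above, each command that _convert_audio_to_wav shells out looks like this (paths illustrative):

    sox -t raw -r 16000 -b 16 -e signed-integer -B -c 1 "an4/wav/an4_clstk/fash/an251-fash-b.raw" "an4/wav/an4_clstk/fash/an251-fash-b.wav"

i.e. the big-endian 16 kHz raw recordings are rewrapped in place as 16-bit mono wav files.
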
+ Args: + csv_file: str, path to *.csv file with data description, usually start from 'cv-' + target_dir: str, path to dir to save results; wav/ and txt/ dirs will be created + """ + wav_dir = os.path.join(target_dir, 'wav/') + txt_dir = os.path.join(target_dir, 'txt/') + os.makedirs(wav_dir, exist_ok=True) + os.makedirs(txt_dir, exist_ok=True) + path_to_data = os.path.dirname(csv_file) + + def process(x): + file_path, text = x + file_name = os.path.splitext(os.path.basename(file_path))[0] + text = text.strip().upper() + with open(os.path.join(txt_dir, file_name + '.txt'), 'w') as f: + f.write(text) + cmd = "sox {} -r {} -b 16 -c 1 {}".format( + os.path.join(path_to_data, file_path), + args.sample_rate, + os.path.join(wav_dir, file_name + '.wav')) + subprocess.call([cmd], shell=True) + + print('Converting mp3 to wav for {}.'.format(csv_file)) + with open(csv_file) as csvfile: + reader = csv.DictReader(csvfile) + data = [(row['filename'], row['text']) for row in reader] + with ThreadPool(10) as pool: + pool.map(process, data) + + +def main(): + target_dir = args.target_dir + os.makedirs(target_dir, exist_ok=True) + + target_unpacked_dir = os.path.join(target_dir, "CV_unpacked") + os.makedirs(target_unpacked_dir, exist_ok=True) + + if args.tar_path and os.path.exists(args.tar_path): + print('Find existing file {}'.format(args.tar_path)) + target_file = args.tar_path + else: + print("Could not find downloaded Common Voice archive, Downloading corpus...") + filename = wget.download(COMMON_VOICE_URL, target_dir) + target_file = os.path.join(target_dir, os.path.basename(filename)) + + print("Unpacking corpus to {} ...".format(target_unpacked_dir)) + tar = tarfile.open(target_file) + tar.extractall(target_unpacked_dir) + tar.close() + + for csv_file in args.files_to_process.split(','): + convert_to_wav(os.path.join(target_unpacked_dir, 'cv_corpus_v1/', csv_file), + os.path.join(target_dir, os.path.splitext(csv_file)[0])) + + print('Creating manifests...') + for csv_file in args.files_to_process.split(','): + create_manifest(os.path.join(target_dir, os.path.splitext(csv_file)[0]), + os.path.splitext(csv_file)[0] + '_manifest.csv', + args.min_duration, + args.max_duration) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/sonosco/datasets/download_datasets/librispeech.py b/sonosco/datasets/download_datasets/librispeech.py new file mode 100644 index 0000000..cc618a1 --- /dev/null +++ b/sonosco/datasets/download_datasets/librispeech.py @@ -0,0 +1,113 @@ +import os +import wget +import tarfile +import argparse +import subprocess +from utils import create_manifest +from tqdm import tqdm +import shutil + +parser = argparse.ArgumentParser(description='Processes and downloads LibriSpeech dataset.') +parser.add_argument("--target-dir", default='LibriSpeech_dataset/', type=str, help="Directory to store the dataset.") +parser.add_argument('--sample-rate', default=16000, type=int, help='Sample rate') +parser.add_argument('--files-to-use', default="train-clean-100.tar.gz," + "train-clean-360.tar.gz,train-other-500.tar.gz," + "dev-clean.tar.gz,dev-other.tar.gz," + "test-clean.tar.gz,test-other.tar.gz", type=str, + help='list of file names to download') +parser.add_argument('--min-duration', default=1, type=int, + help='Prunes training samples shorter than the min duration (given in seconds, default 1)') +parser.add_argument('--max-duration', default=15, type=int, + help='Prunes training samples longer than the max duration (given in seconds, default 15)') +args = 
parser.parse_args() + +LIBRI_SPEECH_URLS = { + "train": ["http://www.openslr.org/resources/12/train-clean-100.tar.gz", + "http://www.openslr.org/resources/12/train-clean-360.tar.gz", + "http://www.openslr.org/resources/12/train-other-500.tar.gz"], + + "val": ["http://www.openslr.org/resources/12/dev-clean.tar.gz", + "http://www.openslr.org/resources/12/dev-other.tar.gz"], + + "test_clean": ["http://www.openslr.org/resources/12/test-clean.tar.gz"], + "test_other": ["http://www.openslr.org/resources/12/test-other.tar.gz"] +} + + +def _preprocess_transcript(phrase): + return phrase.strip().upper() + + +def _process_file(wav_dir, txt_dir, base_filename, root_dir): + full_recording_path = os.path.join(root_dir, base_filename) + assert os.path.exists(full_recording_path) and os.path.exists(root_dir) + wav_recording_path = os.path.join(wav_dir, base_filename.replace(".flac", ".wav")) + subprocess.call(["sox {} -r {} -b 16 -c 1 {}".format(full_recording_path, str(args.sample_rate), + wav_recording_path)], shell=True) + # process transcript + txt_transcript_path = os.path.join(txt_dir, base_filename.replace(".flac", ".txt")) + transcript_file = os.path.join(root_dir, "-".join(base_filename.split('-')[:-1]) + ".trans.txt") + assert os.path.exists(transcript_file), "Transcript file {} does not exist.".format(transcript_file) + transcriptions = open(transcript_file).read().strip().split("\n") + transcriptions = {t.split()[0].split("-")[-1]: " ".join(t.split()[1:]) for t in transcriptions} + with open(txt_transcript_path, "w") as f: + key = base_filename.replace(".flac", "").split("-")[-1] + assert key in transcriptions, "{} is not in the transcriptions".format(key) + f.write(_preprocess_transcript(transcriptions[key])) + f.flush() + + +def main(): + target_dl_dir = args.target_dir + if not os.path.exists(target_dl_dir): + os.makedirs(target_dl_dir) + files_to_dl = args.files_to_use.strip().split(',') + for split_type, lst_libri_urls in LIBRI_SPEECH_URLS.items(): + split_dir = os.path.join(target_dl_dir, split_type) + if not os.path.exists(split_dir): + os.makedirs(split_dir) + split_wav_dir = os.path.join(split_dir, "wav") + if not os.path.exists(split_wav_dir): + os.makedirs(split_wav_dir) + split_txt_dir = os.path.join(split_dir, "txt") + if not os.path.exists(split_txt_dir): + os.makedirs(split_txt_dir) + extracted_dir = os.path.join(split_dir, "LibriSpeech") + if os.path.exists(extracted_dir): + shutil.rmtree(extracted_dir) + for url in lst_libri_urls: + # check if we want to dl this file + dl_flag = False + for f in files_to_dl: + if url.find(f) != -1: + dl_flag = True + if not dl_flag: + print("Skipping url: {}".format(url)) + continue + filename = url.split("/")[-1] + target_filename = os.path.join(split_dir, filename) + if not os.path.exists(target_filename): + wget.download(url, split_dir) + print("Unpacking {}...".format(filename)) + tar = tarfile.open(target_filename) + tar.extractall(split_dir) + tar.close() + os.remove(target_filename) + print("Converting flac files to wav and extracting transcripts...") + assert os.path.exists(extracted_dir), "Archive {} was not properly uncompressed.".format(filename) + for root, subdirs, files in tqdm(os.walk(extracted_dir)): + for f in files: + if f.find(".flac") != -1: + _process_file(wav_dir=split_wav_dir, txt_dir=split_txt_dir, + base_filename=f, root_dir=root) + + print("Finished {}".format(url)) + shutil.rmtree(extracted_dir) + if split_type == 'train': # Prune to min/max duration + create_manifest(split_dir, 'libri_' + split_type + 
'_manifest.csv', args.min_duration, args.max_duration) + else: + create_manifest(split_dir, 'libri_' + split_type + '_manifest.csv') + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/sonosco/datasets/download_datasets/ted3.py b/sonosco/datasets/download_datasets/ted3.py new file mode 100644 index 0000000..c7d4b3e --- /dev/null +++ b/sonosco/datasets/download_datasets/ted3.py @@ -0,0 +1,123 @@ +import os +import wget +import tarfile +import argparse +import subprocess +import unicodedata +import io +from utils import create_manifest +from tqdm import tqdm + +parser = argparse.ArgumentParser(description='Processes and downloads TED-LIUMv3 dataset.') +parser.add_argument("--target-dir", default='TEDLIUM3_dataset/', type=str, help="Directory to store the dataset.") +parser.add_argument("--tar-path", type=str, help="Path to the TEDLIUM_release tar if downloaded (Optional).") +parser.add_argument('--sample-rate', default=16000, type=int, help='Sample rate') +parser.add_argument('--min-duration', default=1, type=int, + help='Prunes training samples shorter than the min duration (given in seconds, default 1)') +parser.add_argument('--max-duration', default=15, type=int, + help='Prunes training samples longer than the max duration (given in seconds, default 15)') +args = parser.parse_args() + +TED_LIUM_V2_DL_URL = "http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz" + + +def get_utterances_from_stm(stm_file): + """ + Return list of entries containing phrase and its start/end timings + :param stm_file: + :return: + """ + res = [] + with io.open(stm_file, "r", encoding='utf-8') as f: + for stm_line in f: + tokens = stm_line.split() + start_time = float(tokens[3]) + end_time = float(tokens[4]) + filename = tokens[0] + transcript = unicodedata.normalize("NFKD", + " ".join(t for t in tokens[6:]).strip()). 
\ + encode("utf-8", "ignore").decode("utf-8", "ignore") + if transcript != "ignore_time_segment_in_scoring": + res.append({ + "start_time": start_time, "end_time": end_time, + "filename": filename, "transcript": transcript + }) + return res + + +def cut_utterance(src_sph_file, target_wav_file, start_time, end_time, sample_rate=16000): + subprocess.call(["sox {} -r {} -b 16 -c 1 {} trim {} ={}".format(src_sph_file, str(sample_rate), + target_wav_file, start_time, end_time)], + shell=True) + + +def _preprocess_transcript(phrase): + return phrase.strip().upper() + + +def filter_short_utterances(utterance_info, min_len_sec=1.0): + return utterance_info["end_time"] - utterance_info["start_time"] > min_len_sec + + +def prepare_dir(ted_dir): + converted_dir = os.path.join(ted_dir, "converted") + # directories to store converted wav files and their transcriptions + wav_dir = os.path.join(converted_dir, "wav") + if not os.path.exists(wav_dir): + os.makedirs(wav_dir) + txt_dir = os.path.join(converted_dir, "txt") + if not os.path.exists(txt_dir): + os.makedirs(txt_dir) + counter = 0 + entries = os.listdir(os.path.join(ted_dir, "sph")) + for sph_file in tqdm(entries, total=len(entries)): + speaker_name = sph_file.split('.sph')[0] + + sph_file_full = os.path.join(ted_dir, "sph", sph_file) + stm_file_full = os.path.join(ted_dir, "stm", "{}.stm".format(speaker_name)) + + assert os.path.exists(sph_file_full) and os.path.exists(stm_file_full) + all_utterances = get_utterances_from_stm(stm_file_full) + + all_utterances = filter(filter_short_utterances, all_utterances) + for utterance_id, utterance in enumerate(all_utterances): + target_wav_file = os.path.join(wav_dir, "{}_{}.wav".format(utterance["filename"], str(utterance_id))) + target_txt_file = os.path.join(txt_dir, "{}_{}.txt".format(utterance["filename"], str(utterance_id))) + cut_utterance(sph_file_full, target_wav_file, utterance["start_time"], utterance["end_time"], + sample_rate=args.sample_rate) + with io.FileIO(target_txt_file, "w") as f: + f.write(_preprocess_transcript(utterance["transcript"]).encode('utf-8')) + counter += 1 + + +def main(): + target_dl_dir = args.target_dir + #if not os.path.exists(target_dl_dir): + # os.makedirs(target_dl_dir) + + target_unpacked_dir = os.path.join(target_dl_dir, "TEDLIUM_release-3") + #if args.tar_path and os.path.exists(args.tar_path): + # target_file = args.tar_path + #else: + # print("Could not find downloaded TEDLIUM archive, Downloading corpus...") + # wget.download(TED_LIUM_V2_DL_URL, target_dl_dir) + # target_file = os.path.join(target_dl_dir, "TEDLIUM_release-3.tgz") + #if not os.path.exists(target_unpacked_dir): + # print("Unpacking corpus...") + # tar = tarfile.open(target_file) + # tar.extractall(target_dl_dir) + # tar.close() + #else: + # print("Found TEDLIUM directory, skipping unpacking of tar files") + + train_ted_dir = os.path.join(target_unpacked_dir, "data") + train_ted_dir = os.path.join(train_ted_dir, "converted") + + #prepare_dir(train_ted_dir) + print('Creating manifests...') + + create_manifest(train_ted_dir, 'ted3_train_manifest.csv', args.min_duration, args.max_duration) + + +if __name__ == "__main__": + main() diff --git a/sonosco/datasets/download_datasets/voxforge.py b/sonosco/datasets/download_datasets/voxforge.py new file mode 100644 index 0000000..a31febf --- /dev/null +++ b/sonosco/datasets/download_datasets/voxforge.py @@ -0,0 +1,102 @@ +import os +from six.moves import urllib +import argparse +import re +import tempfile +import shutil +import subprocess +import tarfile 
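
For reference, a TED-LIUM .stm line looks roughly like this (layout inferred from the parser above: tokens[0] is the recording name, tokens[3] and tokens[4] the start/end time in seconds, tokens[6:] the transcript; the sample values are made up):

    AlGore_2009 1 AlGore_2009 15.42 25.76 <o,f0,male> we need to go far quickly

Segments transcribed as ignore_time_segment_in_scoring are dropped, and every surviving utterance is cut out of its .sph recording via the sox trim call in cut_utterance.
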
+import io +from tqdm import tqdm + +from utils import create_manifest + +VOXFORGE_URL_16kHz = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit/' + +parser = argparse.ArgumentParser(description='Processes and downloads VoxForge dataset.') +parser.add_argument("--target-dir", default='voxforge_dataset/', type=str, help="Directory to store the dataset.") +parser.add_argument('--sample-rate', default=16000, + type=int, help='Sample rate') +parser.add_argument('--min-duration', default=1, type=int, + help='Prunes training samples shorter than the min duration (given in seconds, default 1)') +parser.add_argument('--max-duration', default=15, type=int, + help='Prunes training samples longer than the max duration (given in seconds, default 15)') +args = parser.parse_args() + + +def _get_recordings_dir(sample_dir, recording_name): + wav_dir = os.path.join(sample_dir, recording_name, "wav") + if os.path.exists(wav_dir): + return "wav", wav_dir + flac_dir = os.path.join(sample_dir, recording_name, "flac") + if os.path.exists(flac_dir): + return "flac", flac_dir + raise Exception("wav or flac directory was not found for recording name: {}".format(recording_name)) + + +def prepare_sample(recording_name, url, target_folder): + """ + Downloads and extracts a sample from VoxForge and puts the wav and txt files into :target_folder. + """ + wav_dir = os.path.join(target_folder, "wav") + if not os.path.exists(wav_dir): + os.makedirs(wav_dir) + txt_dir = os.path.join(target_folder, "txt") + if not os.path.exists(txt_dir): + os.makedirs(txt_dir) + # check if sample is processed + filename_set = set(['_'.join(wav_file.split('_')[:-1]) for wav_file in os.listdir(wav_dir)]) + if recording_name in filename_set: + return + + request = urllib.request.Request(url) + response = urllib.request.urlopen(request) + content = response.read() + response.close() + with tempfile.NamedTemporaryFile(suffix=".tgz", mode='wb') as target_tgz: + target_tgz.write(content) + target_tgz.flush() + dirpath = tempfile.mkdtemp() + + tar = tarfile.open(target_tgz.name) + tar.extractall(dirpath) + tar.close() + + recordings_type, recordings_dir = _get_recordings_dir(dirpath, recording_name) + tgz_prompt_file = os.path.join(dirpath, recording_name, "etc", "PROMPTS") + + if os.path.exists(recordings_dir) and os.path.exists(tgz_prompt_file): + transcriptions = open(tgz_prompt_file).read().strip().split("\n") + transcriptions = {t.split()[0]: " ".join(t.split()[1:]) for t in transcriptions} + for wav_file in os.listdir(recordings_dir): + recording_id = wav_file.split('.{}'.format(recordings_type))[0] + transcription_key = recording_name + "/mfc/" + recording_id + if transcription_key not in transcriptions: + continue + utterance = transcriptions[transcription_key] + + target_wav_file = os.path.join(wav_dir, "{}_{}.wav".format(recording_name, recording_id)) + target_txt_file = os.path.join(txt_dir, "{}_{}.txt".format(recording_name, recording_id)) + with io.FileIO(target_txt_file, "w") as file: + file.write(utterance.encode('utf-8')) + original_wav_file = os.path.join(recordings_dir, wav_file) + subprocess.call(["sox {} -r {} -b 16 -c 1 {}".format(original_wav_file, str(args.sample_rate), + target_wav_file)], shell=True) + + shutil.rmtree(dirpath) + + +if __name__ == '__main__': + target_dir = args.target_dir + sample_rate = args.sample_rate + + if not os.path.isdir(target_dir): + os.makedirs(target_dir) + request = urllib.request.Request(VOXFORGE_URL_16kHz) + response = urllib.request.urlopen(request) + 
content = response.read() + all_files = re.findall("href\=\"(.*\.tgz)\"", content.decode("utf-8")) + for f in tqdm(all_files, total=len(all_files)): + prepare_sample(f.replace(".tgz", ""), VOXFORGE_URL_16kHz + f, target_dir) + print('Creating manifests...') + create_manifest(target_dir, 'voxforge_train_manifest.csv', args.min_duration, args.max_duration) \ No newline at end of file From 31bc8125a7de4df0947b1a9b3870925f521d3c79 Mon Sep 17 00:00:00 2001 From: ga38nif Date: Mon, 27 May 2019 00:27:51 +0200 Subject: [PATCH 08/58] add wget to requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index c7d87ee..19ce43b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,4 +21,5 @@ torchaudio==0.2 torchvision==0.3.0 tqdm==4.32.1 pyyaml==5.1 +wget==3.2 -e git+git@github.com:SeanNaren/warp-ctc.git@ab27454d0cf3f936a6e19165682fe2b759f3f8e5#egg=warpctc_pytorch&subdirectory=pytorch_binding From fcae804043b880cf6aac39ebe73978456f3f242e Mon Sep 17 00:00:00 2001 From: ga38nif Date: Mon, 27 May 2019 00:28:30 +0200 Subject: [PATCH 09/58] modify librispeech script to save data to .temp --- .../datasets/download_datasets/librispeech.py | 40 ++++++++++++------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/sonosco/datasets/download_datasets/librispeech.py b/sonosco/datasets/download_datasets/librispeech.py index cc618a1..b9c8fc4 100644 --- a/sonosco/datasets/download_datasets/librispeech.py +++ b/sonosco/datasets/download_datasets/librispeech.py @@ -3,7 +3,7 @@ import tarfile import argparse import subprocess -from utils import create_manifest +from data_utils import create_manifest from tqdm import tqdm import shutil @@ -22,15 +22,15 @@ args = parser.parse_args() LIBRI_SPEECH_URLS = { - "train": ["http://www.openslr.org/resources/12/train-clean-100.tar.gz", - "http://www.openslr.org/resources/12/train-clean-360.tar.gz", - "http://www.openslr.org/resources/12/train-other-500.tar.gz"], + #"train": ["http://www.openslr.org/resources/12/train-clean-100.tar.gz", + # "http://www.openslr.org/resources/12/train-clean-360.tar.gz", + # "http://www.openslr.org/resources/12/train-other-500.tar.gz"], - "val": ["http://www.openslr.org/resources/12/dev-clean.tar.gz", - "http://www.openslr.org/resources/12/dev-other.tar.gz"], + #"val": ["http://www.openslr.org/resources/12/dev-clean.tar.gz", + # "http://www.openslr.org/resources/12/dev-other.tar.gz"], - "test_clean": ["http://www.openslr.org/resources/12/test-clean.tar.gz"], - "test_other": ["http://www.openslr.org/resources/12/test-other.tar.gz"] + "test_clean": ["http://www.openslr.org/resources/12/test-clean.tar.gz"]#, + #"test_other": ["http://www.openslr.org/resources/12/test-other.tar.gz"] } @@ -58,12 +58,24 @@ def _process_file(wav_dir, txt_dir, base_filename, root_dir): def main(): - target_dl_dir = args.target_dir - if not os.path.exists(target_dl_dir): - os.makedirs(target_dl_dir) - files_to_dl = args.files_to_use.strip().split(',') + root = os.path.expanduser('~') + data_path = '.temp/data/libri' + + filenames = [ + 'train-clean-100.tar.gz', + 'train-clean-360.tar.gz', + 'train-other-500.tar.gz', + 'dev-clean.tar.gz', + 'dev-other.tar.gz', + 'test-clean.tar.gz', + 'test-other.tar.gz' + ] + path_to_data = os.path.join(root, data_path) + if not os.path.exists(path_to_data): + os.makedirs(path_to_data) + for split_type, lst_libri_urls in LIBRI_SPEECH_URLS.items(): - split_dir = os.path.join(target_dl_dir, split_type) + split_dir = os.path.join(path_to_data, split_type) if not 
os.path.exists(split_dir): os.makedirs(split_dir) split_wav_dir = os.path.join(split_dir, "wav") @@ -78,7 +90,7 @@ def main(): for url in lst_libri_urls: # check if we want to dl this file dl_flag = False - for f in files_to_dl: + for f in filenames: if url.find(f) != -1: dl_flag = True if not dl_flag: From 7a5d5a51db9b38f0fc1141f4d6809dd62760827d Mon Sep 17 00:00:00 2001 From: ga38nif Date: Mon, 27 May 2019 00:29:04 +0200 Subject: [PATCH 10/58] add utils file and merge_manifest script from deepspeech --- .../datasets/download_datasets/data_utils.py | 43 +++++++++++++++++++ .../download_datasets/merge_manifests.py | 31 +++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 sonosco/datasets/download_datasets/data_utils.py create mode 100644 sonosco/datasets/download_datasets/merge_manifests.py diff --git a/sonosco/datasets/download_datasets/data_utils.py b/sonosco/datasets/download_datasets/data_utils.py new file mode 100644 index 0000000..ae2cc68 --- /dev/null +++ b/sonosco/datasets/download_datasets/data_utils.py @@ -0,0 +1,43 @@ +from __future__ import print_function + +import fnmatch +import io +import os +from tqdm import tqdm +import subprocess +import torch.distributed as dist + + +def create_manifest(data_path, output_path, min_duration=None, max_duration=None): + file_paths = [os.path.join(dirpath, f) + for dirpath, dirnames, files in os.walk(data_path) + for f in fnmatch.filter(files, '*.wav')] + file_paths = order_and_prune_files(file_paths, min_duration, max_duration) + with io.FileIO(output_path, "w") as file: + for wav_path in tqdm(file_paths, total=len(file_paths)): + transcript_path = wav_path.replace('/wav/', '/txt/').replace('.wav', '.txt') + sample = os.path.abspath(wav_path) + ',' + os.path.abspath(transcript_path) + '\n' + file.write(sample.encode('utf-8')) + print('\n') + + +def order_and_prune_files(file_paths, min_duration, max_duration): + print("Sorting manifests...") + duration_file_paths = [(path, float(subprocess.check_output( + ['soxi -D \"%s\"' % path.strip()], shell=True))) for path in file_paths] + if min_duration and max_duration: + print("Pruning manifests between %d and %d seconds" % (min_duration, max_duration)) + duration_file_paths = [(path, duration) for path, duration in duration_file_paths if + min_duration <= duration <= max_duration] + + def func(element): + return element[1] + + duration_file_paths.sort(key=func) + return [x[0] for x in duration_file_paths] # Remove durations + +def reduce_tensor(tensor, world_size): + rt = tensor.clone() + dist.all_reduce(rt, op=dist.reduce_op.SUM) + rt /= world_size + return rt \ No newline at end of file diff --git a/sonosco/datasets/download_datasets/merge_manifests.py b/sonosco/datasets/download_datasets/merge_manifests.py new file mode 100644 index 0000000..e5e0fab --- /dev/null +++ b/sonosco/datasets/download_datasets/merge_manifests.py @@ -0,0 +1,31 @@ +from __future__ import print_function + +import argparse +import io +import os + +from tqdm import tqdm +from utils import order_and_prune_files + +parser = argparse.ArgumentParser(description='Merges all manifest CSV files in specified folder.') +parser.add_argument('--merge-dir', default='manifests/', help='Path to all manifest files you want to merge') +parser.add_argument('--min-duration', default=1, type=int, + help='Prunes any samples shorter than the min duration (given in seconds, default 1)') +parser.add_argument('--max-duration', default=15, type=int, + help='Prunes any samples longer than the max duration (given in seconds, default 
15)') +parser.add_argument('--output-path', default='merged_manifest.csv', help='Output path to merged manifest') + +args = parser.parse_args() + +file_paths = [] +for file in os.listdir(args.merge_dir): + if file.endswith(".csv"): + with open(os.path.join(args.merge_dir, file), 'r') as fh: + file_paths += fh.readlines() +file_paths = [file_path.split(',')[0] for file_path in file_paths] +file_paths = order_and_prune_files(file_paths, args.min_duration, args.max_duration) +with io.FileIO(args.output_path, "w") as file: + for wav_path in tqdm(file_paths, total=len(file_paths)): + transcript_path = wav_path.replace('/wav/', '/txt/').replace('.wav', '.txt') + sample = os.path.abspath(wav_path) + ',' + os.path.abspath(transcript_path) + '\n' + file.write(sample.encode('utf-8')) \ No newline at end of file From d6163b755f99ded5015c382d7bec0f923d9b601d Mon Sep 17 00:00:00 2001 From: "w.jurasz" Date: Mon, 27 May 2019 00:30:37 +0200 Subject: [PATCH 11/58] =?UTF-8?q?Minor=20fixed=CB=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 2 +- sonosco/__init__.py | 0 sonosco/config/test.yaml | 2 +- sonosco/loader.py | 6 ++++-- sonosco/utils.py | 10 ++++++++-- 5 files changed, 14 insertions(+), 6 deletions(-) create mode 100644 sonosco/__init__.py diff --git a/requirements.txt b/requirements.txt index 19ce43b..bba5fc7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -apex==0.1 +#apex==0.1 audioread==2.1.7 cycler==0.10.0 decorator==4.4.0 diff --git a/sonosco/__init__.py b/sonosco/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sonosco/config/test.yaml b/sonosco/config/test.yaml index 2589e15..7609ef1 100644 --- a/sonosco/config/test.yaml +++ b/sonosco/config/test.yaml @@ -4,6 +4,6 @@ test: batch_size: 32 # Batch size for testing num_workers: 4 # Number of workers used in loading verbose: True # Print out decoded output and error of each sample - save_output: Trur # Saves output of model from test + save_output: True # Saves output of model from test output_path: "" # Where to save raw acoustic output diff --git a/sonosco/loader.py b/sonosco/loader.py index 00b06d1..b73b065 100644 --- a/sonosco/loader.py +++ b/sonosco/loader.py @@ -13,8 +13,10 @@ import torchaudio from scipy import signal from torch.utils.data import Dataset, DataLoader, Sampler -from torch.distributed import get_rank -from torch.distributed import get_world_size + +# FIXME: Deprecated functions usage +from torch.distributed.deprecated import get_rank +from torch.distributed.deprecated import get_world_size windows = {"bartlett": torch.bartlett_window, "blackman": torch.blackman_window, diff --git a/sonosco/utils.py b/sonosco/utils.py index 5e219d1..72d8353 100644 --- a/sonosco/utils.py +++ b/sonosco/utils.py @@ -1,5 +1,11 @@ import torch -from apex.fp16_utils import BN_convert_float + +try: + from apex.fp16_utils import BN_convert_float +except Exception as e: + print(f"Apex import failed: {e}") + + import torch.distributed as dist from models.deepspeech2 import DeepSpeech2 @@ -43,7 +49,7 @@ def check_loss(loss, loss_value): def load_model(device, model_path, is_cuda): - model = DeepSpeech.load_model(model_path) + model = DeepSpeech2.load_model(model_path) model.eval() model = model.to(device) if is_cuda and model.mixed_precision: From b4c70bbdb00865a537d2890317be14f410f4e311 Mon Sep 17 00:00:00 2001 From: "w.jurasz" Date: Thu, 30 May 2019 19:11:47 +0200 Subject: [PATCH 12/58] Added install script plus slight refactornig --- 
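The script below assumes a virtual env at ./venv (overridable) and builds SeanNaren's warp-ctc from source before installing the post-requirements (torchaudio and apex). A typical invocation, with a hypothetical venv path:

    ./install_dependencies.sh --cuda=true --python_path=/opt/venvs/sonosco

When CUDA is disabled, it patches warp-ctc's CMakeLists.txt so that OpenMP is only enabled when CUDA is found.
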
.gitignore | 2 +- install_dependencies.sh | 43 +++++++++++++++++++++++++++++++++++++++++ post_requirements.txt | 2 ++ requirements.txt | 3 --- setup.py | 1 + 5 files changed, 47 insertions(+), 4 deletions(-) create mode 100755 install_dependencies.sh create mode 100644 post_requirements.txt diff --git a/.gitignore b/.gitignore index 7a0e43a..ba47f6f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ # Created by .ignore support plugin (hsz.mobi) .idea/ - +warp-ctc/ ### Python template # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/install_dependencies.sh b/install_dependencies.sh new file mode 100755 index 0000000..bea0827 --- /dev/null +++ b/install_dependencies.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +#This scripts assumes that you have a virtual env in ./venv, you can override this by ./install_dependencies.sh -p /some/other/path + +set -e + +# define arguments +for i in "$@" +do +case ${i} in + -c=*|--cuda=*) + CUDA="${i#*=}" + shift # past argument=value + ;; + -p=*|--python_path=*) + PYTHON_HOME_PATH="${i#*=}" + shift # past argument=value + ;; + *) + # unknown option + ;; +esac +done + +PYTHON_HOME_PATH=${PYTHON_HOME_PATH:-./venv} +#TODO: Infer this automatically +CUDA=${CUDA:-false} +source ${PYTHON_HOME_PATH}/bin/activate + +pip install -r requirements.txt + +git clone https://github.com/SeanNaren/warp-ctc.git +if [ "$CUDA" = false ] ; then + sed -i '' 's/option(WITH_OMP \"compile warp-ctc with openmp.\" ON)/option(WITH_OMP \"compile warp-ctc with openmp.\" ${CUDA_FOUND})/' warp-ctc/CMakeLists.txt +else + export CUDA_HOME="/usr/local/cuda" +fi +cd warp-ctc; mkdir build; cd build; cmake ..; make +cd ../pytorch_binding && python setup.py install +cd ../.. +rm -rf warp-ctc + +pip install -r post_requirements.txt \ No newline at end of file diff --git a/post_requirements.txt b/post_requirements.txt new file mode 100644 index 0000000..f4c027f --- /dev/null +++ b/post_requirements.txt @@ -0,0 +1,2 @@ +-e git://github.com/pytorch/audio.git#egg=torchaudio-0.2 +-e git://github.com/NVIDIA/apex.git#egg=apex diff --git a/requirements.txt b/requirements.txt index bba5fc7..46e6bd6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -#apex==0.1 audioread==2.1.7 cycler==0.10.0 decorator==4.4.0 @@ -17,9 +16,7 @@ scikit-learn==0.21.2 scipy==1.3.0 six==1.12.0 torch==1.1.0 -torchaudio==0.2 torchvision==0.3.0 tqdm==4.32.1 pyyaml==5.1 wget==3.2 --e git+git@github.com:SeanNaren/warp-ctc.git@ab27454d0cf3f936a6e19165682fe2b759f3f8e5#egg=warpctc_pytorch&subdirectory=pytorch_binding diff --git a/setup.py b/setup.py index bdb1387..1dc7d35 100644 --- a/setup.py +++ b/setup.py @@ -6,4 +6,5 @@ author="The Roboy Gang", packages=["sonosco"], include_package_data=True, + dependency_links=['http://github.com/pytorch/audio/tarball/master#egg=torchaudio-0.2'] ) From 60f750030aa44393a50f90254f6b78ba3515e72c Mon Sep 17 00:00:00 2001 From: "w.jurasz" Date: Thu, 30 May 2019 22:05:59 +0200 Subject: [PATCH 13/58] Added Dockerfile, started PyCandle integration --- .dockerignore | 127 ++++++++++++++++++++++++++++++++++++++ .gitignore | 5 ++ Dockerfile | 38 ++++++++++++ install_dependencies.sh | 15 ++++- requirements.txt | 42 ++++++------- sonosco/pycandle_train.py | 18 ++++++ 6 files changed, 221 insertions(+), 24 deletions(-) create mode 100644 .dockerignore create mode 100644 Dockerfile create mode 100644 sonosco/pycandle_train.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..f8f2779 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,127 @@ +# Created 
by .ignore support plugin (hsz.mobi) +### VirtualEnv template +# Virtualenv +# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ +.Python +[Bb]in +[Ii]nclude +[Ll]ib +[Ll]ib64 +[Ll]ocal +[Ss]cripts +pyvenv.cfg +.venv +pip-selfcheck.json +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +### Example user template template +### Example user template + +# IntelliJ project files +.idea +*.iml +out +gen diff --git a/.gitignore b/.gitignore index ba47f6f..58898a1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,9 @@ # Created by .ignore support plugin (hsz.mobi) +sonosco/pycandle/ +sonosco/pycandle +sonosco/datasets/download_datasets/ +!sonosco/datasets/download_datasets/*.py + .idea/ warp-ctc/ ### Python template diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..8f57494 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,38 @@ +FROM pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel +ARG CUDA=false + + +WORKDIR /workspace/ +COPY . . +# install basics +RUN apt-get update -y +RUN apt-get install -y git curl ca-certificates bzip2 cmake tree htop bmon iotop sox libsox-dev libsox-fmt-all vim wget + +ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH + +# install python deps +RUN pip install -r requirements.txt + +RUN rm -rf warp-ctc +RUN git clone https://github.com/SeanNaren/warp-ctc.git +RUN if [ "$CUDA" = false ] ; then sed -i 's/option(WITH_OMP \"compile warp-ctc with openmp.\" ON)/option(WITH_OMP \"compile warp-ctc with openmp.\" ${CUDA_FOUND})/' warp-ctc/CMakeLists.txt ; else export CUDA_HOME="/usr/local/cuda" ; fi +RUN cd warp-ctc; mkdir build; cd build; cmake ..; make +RUN cd warp-ctc/pytorch_binding && python setup.py install +RUN rm -rf warp-ctc + +RUN pip install -r post_requirements.txt + + +#TODO: Do we need those two below? +# install ctcdecode +#RUN git clone --recursive https://github.com/parlance/ctcdecode.git +#RUN cd ctcdecode; pip install . + +# install deepspeech.pytorch +ADD . 
/workspace/deepspeech.pytorch +RUN cd deepspeech.pytorch; pip install -r requirements.txt + +# launch jupiter +RUN pip install jupyter +RUN mkdir data; mkdir notebooks; +CMD jupyter-notebook --ip="*" --no-browser --allow-root \ No newline at end of file diff --git a/install_dependencies.sh b/install_dependencies.sh index bea0827..571c767 100755 --- a/install_dependencies.sh +++ b/install_dependencies.sh @@ -12,8 +12,12 @@ case ${i} in CUDA="${i#*=}" shift # past argument=value ;; + -e=*|--venv=*) + VENV="${i#*=}" + shift # past argument=value + ;; -p=*|--python_path=*) - PYTHON_HOME_PATH="${i#*=}" + VENV_PATH="${i#*=}" shift # past argument=value ;; *) @@ -22,10 +26,15 @@ case ${i} in esac done -PYTHON_HOME_PATH=${PYTHON_HOME_PATH:-./venv} +VENV=${VENV:-true} + +if [ "$VENV" = true ] ; then + VENV_PATH=${VENV_PATH:-./venv} + source ${VENV_PATH}/bin/activate +fi + #TODO: Infer this automatically CUDA=${CUDA:-false} -source ${PYTHON_HOME_PATH}/bin/activate pip install -r requirements.txt diff --git a/requirements.txt b/requirements.txt index 46e6bd6..bd62925 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,22 +1,22 @@ -audioread==2.1.7 -cycler==0.10.0 -decorator==4.4.0 -joblib==0.13.2 -kiwisolver==1.1.0 -librosa==0.6.3 -llvmlite==0.28.0 -matplotlib==3.1.0 -numba==0.43.1 -numpy==1.16.3 -Pillow==6.0.0 -pyparsing==2.4.0 -python-dateutil==2.8.0 -resampy==0.2.1 -scikit-learn==0.21.2 -scipy==1.3.0 -six==1.12.0 +#audioread==2.1.7 +#cycler==0.10.0 +#decorator==4.4.0 +#joblib==0.13.2 +#kiwisolver==1.1.0 +#librosa==0.6.3 +#llvmlite==0.28.0 +#matplotlib==3.1.0 +#numba==0.43.1 +#numpy==1.16.3 +#Pillow==6.0.0 +#pyparsing==2.4.0 +#python-dateutil==2.8.0 +#resampy==0.2.1 +#scikit-learn==0.21.2 +#scipy==1.3.0 +#six==1.12.0 torch==1.1.0 -torchvision==0.3.0 -tqdm==4.32.1 -pyyaml==5.1 -wget==3.2 +#torchvision==0.3.0 +#tqdm==4.32.1 +#pyyaml==5.1 +#wget==3.2 diff --git a/sonosco/pycandle_train.py b/sonosco/pycandle_train.py new file mode 100644 index 0000000..9b08b19 --- /dev/null +++ b/sonosco/pycandle_train.py @@ -0,0 +1,18 @@ +import torch +import torchvision +import torch.nn.functional as F + +from sonosco.models.deepspeech2 import DeepSpeech2 +from sonosco.pycandle.general.experiment import Experiment +from sonosco.pycandle.training.model_trainer import ModelTrainer + + +def load_datasets(batch_size_train, batch_size_test): + pass + +model = DeepSpeech2().cuda() +experiment = Experiment('mnist_example') +train_loader, val_loader = load_datasets(batch_size_train=64, batch_size_test=64) +optimizer = torch.optim.Adam(model.parameters(), lr=0.01) +model_trainer = ModelTrainer(model, optimizer, F.nll_loss, 20, train_loader, gpu=0) +model_trainer.start_training() From 824e97bec1fb37f8a0e81700e7a0429b7530c61c Mon Sep 17 00:00:00 2001 From: "w.jurasz" Date: Sat, 1 Jun 2019 13:35:32 +0200 Subject: [PATCH 14/58] =?UTF-8?q?Added=20simple=20train=20function=CB=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + install_dependencies.sh | 1 + sonosco/pycandle_train.py | 25 ++++++++++++++++++++----- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 58898a1..14dcffc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # Created by .ignore support plugin (hsz.mobi) sonosco/pycandle/ sonosco/pycandle +sonosco/experiments/ sonosco/datasets/download_datasets/ !sonosco/datasets/download_datasets/*.py diff --git a/install_dependencies.sh b/install_dependencies.sh index 571c767..4cdbedd 100755 --- 
a/install_dependencies.sh +++ b/install_dependencies.sh @@ -40,6 +40,7 @@ pip install -r requirements.txt git clone https://github.com/SeanNaren/warp-ctc.git if [ "$CUDA" = false ] ; then + # This works for mac, for other OSes remove '' after -i sed -i '' 's/option(WITH_OMP \"compile warp-ctc with openmp.\" ON)/option(WITH_OMP \"compile warp-ctc with openmp.\" ${CUDA_FOUND})/' warp-ctc/CMakeLists.txt else export CUDA_HOME="/usr/local/cuda" diff --git a/sonosco/pycandle_train.py b/sonosco/pycandle_train.py index 9b08b19..ddb233e 100644 --- a/sonosco/pycandle_train.py +++ b/sonosco/pycandle_train.py @@ -1,18 +1,33 @@ import torch -import torchvision import torch.nn.functional as F +import sonosco.datasets.download_datasets.librispeech as librispeech +from sonosco.datasets.AudioDataLoader import AudioDataLoader +from sonosco.datasets.AudioDataSampler import BucketingSampler +from sonosco.datasets.AudioDataset import AudioDataset from sonosco.models.deepspeech2 import DeepSpeech2 from sonosco.pycandle.general.experiment import Experiment from sonosco.pycandle.training.model_trainer import ModelTrainer +def load_datasets(manifest_path, batch_size_train, batch_size_test): + audio_conf = dict(sample_rate=16000, + window_size=.02, + window_stride=.01, + window='hamming') + labels = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + test_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=manifest_path, labels=labels, + normalize=False, augment=False) + print("Dataset is created\n====================\n") -def load_datasets(batch_size_train, batch_size_test): - pass + batch_size = 16 + sampler = BucketingSampler(test_dataset, batch_size=batch_size) + return AudioDataLoader(test_dataset, num_workers=4, batch_sampler=sampler) -model = DeepSpeech2().cuda() +librispeech.main() +model = DeepSpeech2().cpu() experiment = Experiment('mnist_example') -train_loader, val_loader = load_datasets(batch_size_train=64, batch_size_test=64) +train_loader = load_datasets("./datasets/download_datasets/libri_test_clean_manifest.csv", + batch_size_train=64, batch_size_test=64) optimizer = torch.optim.Adam(model.parameters(), lr=0.01) model_trainer = ModelTrainer(model, optimizer, F.nll_loss, 20, train_loader, gpu=0) model_trainer.start_training() From 97d65ce3279163418ccb97ee2596488676346a9c Mon Sep 17 00:00:00 2001 From: "w.jurasz" Date: Sat, 1 Jun 2019 14:05:58 +0200 Subject: [PATCH 15/58] =?UTF-8?q?Import=20fixed=20=CB=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sonosco/datasets/download_datasets/librispeech.py | 2 +- sonosco/pycandle_train.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sonosco/datasets/download_datasets/librispeech.py b/sonosco/datasets/download_datasets/librispeech.py index b9c8fc4..31773cf 100644 --- a/sonosco/datasets/download_datasets/librispeech.py +++ b/sonosco/datasets/download_datasets/librispeech.py @@ -3,7 +3,7 @@ import tarfile import argparse import subprocess -from data_utils import create_manifest +from datasets.download_datasets.data_utils import create_manifest from tqdm import tqdm import shutil diff --git a/sonosco/pycandle_train.py b/sonosco/pycandle_train.py index ddb233e..2d02796 100644 --- a/sonosco/pycandle_train.py +++ b/sonosco/pycandle_train.py @@ -1,13 +1,13 @@ import torch import torch.nn.functional as F -import sonosco.datasets.download_datasets.librispeech as librispeech +import datasets.download_datasets.librispeech as librispeech -from sonosco.datasets.AudioDataLoader import 
AudioDataLoader -from sonosco.datasets.AudioDataSampler import BucketingSampler -from sonosco.datasets.AudioDataset import AudioDataset -from sonosco.models.deepspeech2 import DeepSpeech2 -from sonosco.pycandle.general.experiment import Experiment -from sonosco.pycandle.training.model_trainer import ModelTrainer +from datasets.AudioDataLoader import AudioDataLoader +from datasets.AudioDataSampler import BucketingSampler +from datasets.AudioDataset import AudioDataset +from models.deepspeech2 import DeepSpeech2 +from pycandle.general.experiment import Experiment +from pycandle.training.model_trainer import ModelTrainer def load_datasets(manifest_path, batch_size_train, batch_size_test): audio_conf = dict(sample_rate=16000, From d328973cc0370be6acca11ce28dbc07dedeec762 Mon Sep 17 00:00:00 2001 From: "w.jurasz" Date: Sat, 1 Jun 2019 15:52:32 +0200 Subject: [PATCH 16/58] Uncommented requirements --- requirements.txt | 42 +++++++++++++++++++++--------------------- setup.py | 2 +- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/requirements.txt b/requirements.txt index bd62925..46e6bd6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,22 +1,22 @@ -#audioread==2.1.7 -#cycler==0.10.0 -#decorator==4.4.0 -#joblib==0.13.2 -#kiwisolver==1.1.0 -#librosa==0.6.3 -#llvmlite==0.28.0 -#matplotlib==3.1.0 -#numba==0.43.1 -#numpy==1.16.3 -#Pillow==6.0.0 -#pyparsing==2.4.0 -#python-dateutil==2.8.0 -#resampy==0.2.1 -#scikit-learn==0.21.2 -#scipy==1.3.0 -#six==1.12.0 +audioread==2.1.7 +cycler==0.10.0 +decorator==4.4.0 +joblib==0.13.2 +kiwisolver==1.1.0 +librosa==0.6.3 +llvmlite==0.28.0 +matplotlib==3.1.0 +numba==0.43.1 +numpy==1.16.3 +Pillow==6.0.0 +pyparsing==2.4.0 +python-dateutil==2.8.0 +resampy==0.2.1 +scikit-learn==0.21.2 +scipy==1.3.0 +six==1.12.0 torch==1.1.0 -#torchvision==0.3.0 -#tqdm==4.32.1 -#pyyaml==5.1 -#wget==3.2 +torchvision==0.3.0 +tqdm==4.32.1 +pyyaml==5.1 +wget==3.2 diff --git a/setup.py b/setup.py index 1dc7d35..207c04e 100644 --- a/setup.py +++ b/setup.py @@ -6,5 +6,5 @@ author="The Roboy Gang", packages=["sonosco"], include_package_data=True, - dependency_links=['http://github.com/pytorch/audio/tarball/master#egg=torchaudio-0.2'] + dependency_links=[] ) From 993e5ecfa41042491f0455d6e63c669f8587588e Mon Sep 17 00:00:00 2001 From: "w.jurasz" Date: Sat, 1 Jun 2019 18:50:17 +0200 Subject: [PATCH 17/58] =?UTF-8?q?Fixed=20args=CB=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 ++ Dockerfile | 2 +- sonosco/datasets/AudioDataLoader.py | 4 ++-- sonosco/pycandle_train.py | 4 ++-- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 14dcffc..453b1ff 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ sonosco/experiments/ sonosco/datasets/download_datasets/ !sonosco/datasets/download_datasets/*.py +**/.DS_Store + .idea/ warp-ctc/ ### Python template diff --git a/Dockerfile b/Dockerfile index 8f57494..3b9c9ab 100644 --- a/Dockerfile +++ b/Dockerfile @@ -35,4 +35,4 @@ RUN cd deepspeech.pytorch; pip install -r requirements.txt # launch jupiter RUN pip install jupyter RUN mkdir data; mkdir notebooks; -CMD jupyter-notebook --ip="*" --no-browser --allow-root \ No newline at end of file +#CMD jupyter-notebook --ip="*" --no-browser --allow-root \ No newline at end of file diff --git a/sonosco/datasets/AudioDataLoader.py b/sonosco/datasets/AudioDataLoader.py index 3161abd..8786c2d 100644 --- a/sonosco/datasets/AudioDataLoader.py +++ b/sonosco/datasets/AudioDataLoader.py @@ -12,8 
+12,8 @@ def __init__(self, *args, **kwargs): super(AudioDataLoader, self).__init__(*args, **kwargs) self.collate_fn = self._collate_fn -# TODO: Optimise - def _collate_fn(batch): + # TODO: Optimise + def _collate_fn(self, batch): batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True) longest_sample = batch[0][0] freq_size, max_seqlength = longest_sample.size() diff --git a/sonosco/pycandle_train.py b/sonosco/pycandle_train.py index 2d02796..7310789 100644 --- a/sonosco/pycandle_train.py +++ b/sonosco/pycandle_train.py @@ -1,6 +1,6 @@ import torch import torch.nn.functional as F -import datasets.download_datasets.librispeech as librispeech +#import datasets.download_datasets.librispeech as librispeech from datasets.AudioDataLoader import AudioDataLoader from datasets.AudioDataSampler import BucketingSampler @@ -23,7 +23,7 @@ def load_datasets(manifest_path, batch_size_train, batch_size_test): sampler = BucketingSampler(test_dataset, batch_size=batch_size) return AudioDataLoader(test_dataset, num_workers=4, batch_sampler=sampler) -librispeech.main() +# librispeech.main() model = DeepSpeech2().cpu() experiment = Experiment('mnist_example') train_loader = load_datasets("./datasets/download_datasets/libri_test_clean_manifest.csv", From 4135da1b7402d64d998e6ee854b4c82c8629ff01 Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Tue, 4 Jun 2019 14:14:07 +0200 Subject: [PATCH 18/58] start refactoring librispeech data download --- sonosco/common/__init__.py | 0 sonosco/common/audio_tools.py | 9 +++ sonosco/common/click_extensions.py | 16 ++++ sonosco/common/utils.py | 21 +++++ .../datasets/download_datasets/data_utils.py | 35 ++++---- .../datasets/download_datasets/librispeech.py | 80 +++++++++---------- 6 files changed, 102 insertions(+), 59 deletions(-) create mode 100644 sonosco/common/__init__.py create mode 100644 sonosco/common/audio_tools.py create mode 100644 sonosco/common/click_extensions.py create mode 100644 sonosco/common/utils.py diff --git a/sonosco/common/__init__.py b/sonosco/common/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sonosco/common/audio_tools.py b/sonosco/common/audio_tools.py new file mode 100644 index 0000000..21c811b --- /dev/null +++ b/sonosco/common/audio_tools.py @@ -0,0 +1,9 @@ +import subprocess + + +def get_duration(file_path): + return float(subprocess.check_output([f'soxi -D "{file_path.strip()}"'], shell=True)) + + +def transcode_recording(source, destination, sample_rate): + subprocess.call([f"sox {source} -r {sample_rate} -b 16 -c 1 {destination}"], shell=True) diff --git a/sonosco/common/click_extensions.py b/sonosco/common/click_extensions.py new file mode 100644 index 0000000..18ff96f --- /dev/null +++ b/sonosco/common/click_extensions.py @@ -0,0 +1,16 @@ +import click +import ast +import logging + + +logger = logging.getLogger(__name__) + + +class PythonLiteralOption(click.Option): + + def type_cast_value(self, ctx, value): + try: + return ast.literal_eval(value) + except Exception as e: + logger.error(f"Malformed click input for PythonLiteralOption {e}", exc_info=True) + raise click.BadParameter(value) diff --git a/sonosco/common/utils.py b/sonosco/common/utils.py new file mode 100644 index 0000000..a570af0 --- /dev/null +++ b/sonosco/common/utils.py @@ -0,0 +1,21 @@ +import logging +import os + + +def setup_logging(logger: logging.Logger, filename=None, verbosity=False): + logger.setLevel(logging.DEBUG) + if filename is not None: + log_directory = os.path.dirname(filename) + if not 
os.path.exists(log_directory): + os.makedirs(log_directory) + filename = os.path.join(log_directory, f"{filename}.log") + f_handler = logging.FileHandler(filename=filename, mode="w") + f_handler.setLevel(logging.DEBUG) + f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + f_handler.setFormatter(f_format) + logger.addHandler(f_handler) + c_handler = logging.StreamHandler() + c_handler.setLevel(logging.DEBUG) if verbosity else c_handler.setLevel(logging.INFO) + c_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + c_handler.setFormatter(c_format) + logger.addHandler(c_handler) diff --git a/sonosco/datasets/download_datasets/data_utils.py b/sonosco/datasets/download_datasets/data_utils.py index ae2cc68..e7e15d5 100644 --- a/sonosco/datasets/download_datasets/data_utils.py +++ b/sonosco/datasets/download_datasets/data_utils.py @@ -1,43 +1,44 @@ -from __future__ import print_function - import fnmatch import io import os -from tqdm import tqdm -import subprocess +import logging import torch.distributed as dist +import sonosco.common.audio_tools as audio_tools + +from tqdm import tqdm + +logger = logging.getLogger(__name__) def create_manifest(data_path, output_path, min_duration=None, max_duration=None): + logger.info(f"Creating a manifest for path: {data_path}") file_paths = [os.path.join(dirpath, f) for dirpath, dirnames, files in os.walk(data_path) for f in fnmatch.filter(files, '*.wav')] + logger.info(f"Found {len(file_paths)} .wav files") file_paths = order_and_prune_files(file_paths, min_duration, max_duration) with io.FileIO(output_path, "w") as file: for wav_path in tqdm(file_paths, total=len(file_paths)): transcript_path = wav_path.replace('/wav/', '/txt/').replace('.wav', '.txt') - sample = os.path.abspath(wav_path) + ',' + os.path.abspath(transcript_path) + '\n' + sample = f"{os.path.abspath(wav_path)},{os.path.abspath(transcript_path)}\n" file.write(sample.encode('utf-8')) - print('\n') def order_and_prune_files(file_paths, min_duration, max_duration): - print("Sorting manifests...") - duration_file_paths = [(path, float(subprocess.check_output( - ['soxi -D \"%s\"' % path.strip()], shell=True))) for path in file_paths] + logger.info("Sorting manifests...") + path_and_duration = [(path, audio_tools.get_duration(path)) for path in file_paths] + if min_duration and max_duration: - print("Pruning manifests between %d and %d seconds" % (min_duration, max_duration)) - duration_file_paths = [(path, duration) for path, duration in duration_file_paths if - min_duration <= duration <= max_duration] + logger.info(f"Pruning manifests between {min_duration} and {max_duration} seconds") + path_and_duration = [(path, duration) for path, duration in path_and_duration + if min_duration <= duration <= max_duration] - def func(element): - return element[1] + path_and_duration.sort(key=lambda e: e[1]) + return [x[0] for x in path_and_duration] - duration_file_paths.sort(key=func) - return [x[0] for x in duration_file_paths] # Remove durations def reduce_tensor(tensor, world_size): rt = tensor.clone() dist.all_reduce(rt, op=dist.reduce_op.SUM) rt /= world_size - return rt \ No newline at end of file + return rt diff --git a/sonosco/datasets/download_datasets/librispeech.py b/sonosco/datasets/download_datasets/librispeech.py index 31773cf..7f8295b 100644 --- a/sonosco/datasets/download_datasets/librispeech.py +++ b/sonosco/datasets/download_datasets/librispeech.py @@ -1,25 +1,19 @@ import os +import click import wget import tarfile 
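# A minimal usage sketch for the refactored create_manifest helper above
# (this downloader calls it at the end of each split; the example path is an
# assumption, not a path shipped with the repository). Durations are read via
# `soxi -D` in audio_tools.get_duration, so the sox CLI must be installed:
#
#   import os
#   from sonosco.datasets.download_datasets.data_utils import create_manifest
#
#   split_dir = os.path.join(os.path.expanduser("~"), "temp/data/libri_speech/train")
#   # writes one "<abs .wav path>,<abs .txt path>" line per sample, pruned to 1-15 s clips
#   create_manifest(split_dir, "libri_train_manifest.csv", min_duration=1, max_duration=15)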
-import argparse -import subprocess -from datasets.download_datasets.data_utils import create_manifest -from tqdm import tqdm import shutil +import logging +import sonosco.common.audio_tools as audio_tools + +from sonosco.datasets.download_datasets.data_utils import create_manifest +from sonosco.common.click_extensions import PythonLiteralOption +from sonosco.common.utils import setup_logging +from tqdm import tqdm + + +logger = logging.getLogger("sonosco.datasets.download_datasets.librispeech") -parser = argparse.ArgumentParser(description='Processes and downloads LibriSpeech dataset.') -parser.add_argument("--target-dir", default='LibriSpeech_dataset/', type=str, help="Directory to store the dataset.") -parser.add_argument('--sample-rate', default=16000, type=int, help='Sample rate') -parser.add_argument('--files-to-use', default="train-clean-100.tar.gz," - "train-clean-360.tar.gz,train-other-500.tar.gz," - "dev-clean.tar.gz,dev-other.tar.gz," - "test-clean.tar.gz,test-other.tar.gz", type=str, - help='list of file names to download') -parser.add_argument('--min-duration', default=1, type=int, - help='Prunes training samples shorter than the min duration (given in seconds, default 1)') -parser.add_argument('--max-duration', default=15, type=int, - help='Prunes training samples longer than the max duration (given in seconds, default 15)') -args = parser.parse_args() LIBRI_SPEECH_URLS = { #"train": ["http://www.openslr.org/resources/12/train-clean-100.tar.gz", @@ -38,12 +32,11 @@ def _preprocess_transcript(phrase): return phrase.strip().upper() -def _process_file(wav_dir, txt_dir, base_filename, root_dir): +def _process_file(wav_dir, txt_dir, base_filename, root_dir, sample_rate): full_recording_path = os.path.join(root_dir, base_filename) assert os.path.exists(full_recording_path) and os.path.exists(root_dir) wav_recording_path = os.path.join(wav_dir, base_filename.replace(".flac", ".wav")) - subprocess.call(["sox {} -r {} -b 16 -c 1 {}".format(full_recording_path, str(args.sample_rate), - wav_recording_path)], shell=True) + audio_tools.transcode_recording(full_recording_path, wav_recording_path, sample_rate) # process transcript txt_transcript_path = os.path.join(txt_dir, base_filename.replace(".flac", ".txt")) transcript_file = os.path.join(root_dir, "-".join(base_filename.split('-')[:-1]) + ".trans.txt") @@ -57,20 +50,22 @@ def _process_file(wav_dir, txt_dir, base_filename, root_dir): f.flush() -def main(): - root = os.path.expanduser('~') - data_path = '.temp/data/libri' - - filenames = [ - 'train-clean-100.tar.gz', - 'train-clean-360.tar.gz', - 'train-other-500.tar.gz', - 'dev-clean.tar.gz', - 'dev-other.tar.gz', - 'test-clean.tar.gz', - 'test-other.tar.gz' - ] - path_to_data = os.path.join(root, data_path) +@click.command() +@click.option("--target-dir", default="temp/data/libri_speech", type=str, help="Directory to store the dataset.") +@click.option("--sample-rate", default=16000, type=int, help="Sample rate.") +@click.option("--files-to-use", multiple=True, + default=["train-clean-100.tar.gz", "train-clean-360.tar.gz", "train-other-500.tar.gz", + "dev-clean.tar.gz", "dev-other.tar.gz", "test-clean.tar.gz", "test-other.tar.gz"], + type=str, help="List of file names to download.") +@click.option("--min-duration", default=1, type=int, + help="Prunes training samples shorter than the min duration (given in seconds).") +@click.option("--max-duration", default=15, type=int, + help="Prunes training samples longer than the max duration (given in seconds).") +def main(target_dir, 
sample_rate, files_to_use, min_duration, max_duration): + """Processes and downloads LibriSpeech dataset.""" + setup_logging(logger) + + path_to_data = os.path.join(os.path.expanduser("~"), target_dir) if not os.path.exists(path_to_data): os.makedirs(path_to_data) @@ -90,36 +85,37 @@ def main(): for url in lst_libri_urls: # check if we want to dl this file dl_flag = False - for f in filenames: + for f in files_to_use: if url.find(f) != -1: dl_flag = True if not dl_flag: - print("Skipping url: {}".format(url)) + logger.info(f"Skipping url: {url}") continue filename = url.split("/")[-1] target_filename = os.path.join(split_dir, filename) if not os.path.exists(target_filename): wget.download(url, split_dir) - print("Unpacking {}...".format(filename)) + logger.info("Download complete") + logger.info(f"Unpacking {filename}...") tar = tarfile.open(target_filename) tar.extractall(split_dir) tar.close() os.remove(target_filename) - print("Converting flac files to wav and extracting transcripts...") + logger.info("Converting flac files to wav and extracting transcripts...") assert os.path.exists(extracted_dir), "Archive {} was not properly uncompressed.".format(filename) for root, subdirs, files in tqdm(os.walk(extracted_dir)): for f in files: if f.find(".flac") != -1: _process_file(wav_dir=split_wav_dir, txt_dir=split_txt_dir, - base_filename=f, root_dir=root) + base_filename=f, root_dir=root, sample_rate=sample_rate) - print("Finished {}".format(url)) + logger.info(f"Finished {url}") shutil.rmtree(extracted_dir) if split_type == 'train': # Prune to min/max duration - create_manifest(split_dir, 'libri_' + split_type + '_manifest.csv', args.min_duration, args.max_duration) + create_manifest(split_dir, 'libri_' + split_type + '_manifest.csv', min_duration, max_duration) else: create_manifest(split_dir, 'libri_' + split_type + '_manifest.csv') if __name__ == "__main__": - main() \ No newline at end of file + main() From e5ff4c3024c61985c431ec9265c014d856b702d2 Mon Sep 17 00:00:00 2001 From: "w.jurasz" Date: Wed, 5 Jun 2019 01:40:55 +0200 Subject: [PATCH 19/58] Added conda local install, partialy adjusted model to pycandle --- install_dependencies.sh | 22 ++++++++++++++-------- post_requirements.txt | 1 - sonosco/datasets/AudioDataLoader.py | 5 +++-- sonosco/models/deepspeech2.py | 12 ++++++++---- sonosco/pycandle_train.py | 2 +- 5 files changed, 26 insertions(+), 16 deletions(-) diff --git a/install_dependencies.sh b/install_dependencies.sh index 4cdbedd..d4ad0a0 100755 --- a/install_dependencies.sh +++ b/install_dependencies.sh @@ -12,12 +12,12 @@ case ${i} in CUDA="${i#*=}" shift # past argument=value ;; - -e=*|--venv=*) - VENV="${i#*=}" + -a=*|--anaconda=*) + ANACONDA="${i#*=}" shift # past argument=value ;; - -p=*|--python_path=*) - VENV_PATH="${i#*=}" + -e=*|--venv=*) + VENV="${i#*=}" shift # past argument=value ;; *) @@ -26,11 +26,12 @@ case ${i} in esac done -VENV=${VENV:-true} +VENV=${VENV:-./venv} -if [ "$VENV" = true ] ; then - VENV_PATH=${VENV_PATH:-./venv} - source ${VENV_PATH}/bin/activate +if [ -z ${ANACONDA+x} ] ; then + conda activate ${ANACONDA} +elif [ -z ${VENV+x} ] ; then + source ${VENV}/bin/activate fi #TODO: Infer this automatically @@ -50,4 +51,9 @@ cd ../pytorch_binding && python setup.py install cd ../.. rm -rf warp-ctc +git clone git@github.com:pytorch/audio.git +cd audio; MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py install +cd .. 
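# Usage sketch for the flags parsed at the top of this script (values are
# illustrative; the case statement accepts -c=/--cuda=, -a=/--anaconda= and
# -e=/--venv=). Note, as an observation: `[ -z ${ANACONDA+x} ]` is true when
# ANACONDA is *unset*, so the conda/venv activation branches appear inverted;
# a `-n` test would match the stated intent.
#   ./install_dependencies.sh -c=true          # build warp-ctc with CUDA support
#   ./install_dependencies.sh -a=sonosco-env   # install into a conda environment
#   ./install_dependencies.sh -e=./other-venv  # install into a non-default virtualenv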
+rm -rf audio + pip install -r post_requirements.txt \ No newline at end of file diff --git a/post_requirements.txt b/post_requirements.txt index f4c027f..f750f4b 100644 --- a/post_requirements.txt +++ b/post_requirements.txt @@ -1,2 +1 @@ --e git://github.com/pytorch/audio.git#egg=torchaudio-0.2 -e git://github.com/NVIDIA/apex.git#egg=apex diff --git a/sonosco/datasets/AudioDataLoader.py b/sonosco/datasets/AudioDataLoader.py index 8786c2d..577d1ed 100644 --- a/sonosco/datasets/AudioDataLoader.py +++ b/sonosco/datasets/AudioDataLoader.py @@ -29,6 +29,7 @@ def _collate_fn(self, batch): inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) target_sizes[x] = len(batch[x][1]) - targets.extend(batch[x][1]) + targets.append([batch[x][1]]) - return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) \ No newline at end of file + # return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) + return (inputs, input_percentages), torch.IntTensor(targets) \ No newline at end of file diff --git a/sonosco/models/deepspeech2.py b/sonosco/models/deepspeech2.py index dcfa100..5d2dff8 100644 --- a/sonosco/models/deepspeech2.py +++ b/sonosco/models/deepspeech2.py @@ -82,13 +82,14 @@ def forward(self, input_): class BatchRNN(nn.Module): - def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, batch_norm=True): + def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, batch_norm=True, bidirectional=True): super(BatchRNN, self).__init__() + self.bidirectional = bidirectional self.input_size = input_size self.hidden_size = hidden_size self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size, - bidirectional=True, bias=True) + bidirectional=self.bidirectional, bias=True) def flatten_parameters(self): self.rnn.flatten_parameters() @@ -155,9 +156,12 @@ def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hid_size=768, nb_layers=5 self.inference_softmax = InferenceBatchSoftmax() - def forward(self, x, lengths): + def forward(self, xx): # if x.is_cuda and self.mixed_precision: # x = x.half() + x, input_percentages = xx + + lengths = input_percentages.mul_(int(x.size(3))).int() lengths = lengths.cpu().int() output_lengths = self.get_seq_lens(lengths) x, _ = self.conv(x, output_lengths) @@ -176,7 +180,7 @@ def forward(self, x, lengths): x = x.transpose(0, 1) # identity in training mode, softmax in eval mode x = self.inference_softmax(x) - return x, output_lengths + return x def get_seq_lens(self, input_length): """ diff --git a/sonosco/pycandle_train.py b/sonosco/pycandle_train.py index 7310789..9645dfa 100644 --- a/sonosco/pycandle_train.py +++ b/sonosco/pycandle_train.py @@ -29,5 +29,5 @@ def load_datasets(manifest_path, batch_size_train, batch_size_test): train_loader = load_datasets("./datasets/download_datasets/libri_test_clean_manifest.csv", batch_size_train=64, batch_size_test=64) optimizer = torch.optim.Adam(model.parameters(), lr=0.01) -model_trainer = ModelTrainer(model, optimizer, F.nll_loss, 20, train_loader, gpu=0) +model_trainer = ModelTrainer(model, optimizer, F.nll_loss, 20, train_loader, gpu=None) model_trainer.start_training() From f0fc4a92e75c7532c60462fc9b9a45e68ec45d7e Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Wed, 5 Jun 2019 02:17:01 +0200 Subject: [PATCH 20/58] finish refactoring librispeech download --- sonosco/common/path_utils.py | 12 
+++++++ .../datasets/download_datasets/data_utils.py | 1 + .../datasets/download_datasets/librispeech.py | 36 ++++++++++--------- 3 files changed, 33 insertions(+), 16 deletions(-) create mode 100644 sonosco/common/path_utils.py diff --git a/sonosco/common/path_utils.py b/sonosco/common/path_utils.py new file mode 100644 index 0000000..bf8c8ef --- /dev/null +++ b/sonosco/common/path_utils.py @@ -0,0 +1,12 @@ +import os +import wget + + +def try_create_directory(path: str): + if not os.path.exists(path): + os.makedirs(path) + + +def try_download(destination: str, url: str): + if not os.path.exists(destination): + wget.download(url, destination) diff --git a/sonosco/datasets/download_datasets/data_utils.py b/sonosco/datasets/download_datasets/data_utils.py index e7e15d5..e6a5cd8 100644 --- a/sonosco/datasets/download_datasets/data_utils.py +++ b/sonosco/datasets/download_datasets/data_utils.py @@ -7,6 +7,7 @@ from tqdm import tqdm +import pdb; pdb.set_trace() logger = logging.getLogger(__name__) diff --git a/sonosco/datasets/download_datasets/librispeech.py b/sonosco/datasets/download_datasets/librispeech.py index 7f8295b..38b3f61 100644 --- a/sonosco/datasets/download_datasets/librispeech.py +++ b/sonosco/datasets/download_datasets/librispeech.py @@ -1,18 +1,17 @@ import os import click -import wget import tarfile import shutil import logging import sonosco.common.audio_tools as audio_tools +import sonosco.common.path_utils as path_utils from sonosco.datasets.download_datasets.data_utils import create_manifest -from sonosco.common.click_extensions import PythonLiteralOption from sonosco.common.utils import setup_logging from tqdm import tqdm -logger = logging.getLogger("sonosco.datasets.download_datasets.librispeech") +logger = logging.getLogger("sonosco") LIBRI_SPEECH_URLS = { @@ -40,12 +39,12 @@ def _process_file(wav_dir, txt_dir, base_filename, root_dir, sample_rate): # process transcript txt_transcript_path = os.path.join(txt_dir, base_filename.replace(".flac", ".txt")) transcript_file = os.path.join(root_dir, "-".join(base_filename.split('-')[:-1]) + ".trans.txt") - assert os.path.exists(transcript_file), "Transcript file {} does not exist.".format(transcript_file) + assert os.path.exists(transcript_file), f"Transcript file {transcript_file} does not exist" transcriptions = open(transcript_file).read().strip().split("\n") transcriptions = {t.split()[0].split("-")[-1]: " ".join(t.split()[1:]) for t in transcriptions} with open(txt_transcript_path, "w") as f: key = base_filename.replace(".flac", "").split("-")[-1] - assert key in transcriptions, "{} is not in the transcriptions".format(key) + assert key in transcriptions, f"{key} is not in the transcriptions" f.write(_preprocess_transcript(transcriptions[key])) f.flush() @@ -71,17 +70,16 @@ def main(target_dir, sample_rate, files_to_use, min_duration, max_duration): for split_type, lst_libri_urls in LIBRI_SPEECH_URLS.items(): split_dir = os.path.join(path_to_data, split_type) - if not os.path.exists(split_dir): - os.makedirs(split_dir) + path_utils.try_create_directory(split_dir) split_wav_dir = os.path.join(split_dir, "wav") - if not os.path.exists(split_wav_dir): - os.makedirs(split_wav_dir) + path_utils.try_create_directory(split_wav_dir) split_txt_dir = os.path.join(split_dir, "txt") - if not os.path.exists(split_txt_dir): - os.makedirs(split_txt_dir) + path_utils.try_create_directory(split_txt_dir) extracted_dir = os.path.join(split_dir, "LibriSpeech") + if os.path.exists(extracted_dir): shutil.rmtree(extracted_dir) + for url in 
lst_libri_urls: # check if we want to dl this file dl_flag = False @@ -91,18 +89,19 @@ def main(target_dir, sample_rate, files_to_use, min_duration, max_duration): if not dl_flag: logger.info(f"Skipping url: {url}") continue + filename = url.split("/")[-1] target_filename = os.path.join(split_dir, filename) - if not os.path.exists(target_filename): - wget.download(url, split_dir) + path_utils.try_download(target_filename, url) logger.info("Download complete") logger.info(f"Unpacking {filename}...") tar = tarfile.open(target_filename) tar.extractall(split_dir) tar.close() os.remove(target_filename) + assert os.path.exists(extracted_dir), f"Archive {filename} was not properly uncompressed" + logger.info("Converting flac files to wav and extracting transcripts...") - assert os.path.exists(extracted_dir), "Archive {} was not properly uncompressed.".format(filename) for root, subdirs, files in tqdm(os.walk(extracted_dir)): for f in files: if f.find(".flac") != -1: @@ -111,10 +110,15 @@ def main(target_dir, sample_rate, files_to_use, min_duration, max_duration): logger.info(f"Finished {url}") shutil.rmtree(extracted_dir) + + manifest_path = os.path.join(path_to_data, f"libri_{split_type}_manifest.csv") + if os.path.exists(manifest_path): + continue + if split_type == 'train': # Prune to min/max duration - create_manifest(split_dir, 'libri_' + split_type + '_manifest.csv', min_duration, max_duration) + create_manifest(split_dir, manifest_path, min_duration, max_duration) else: - create_manifest(split_dir, 'libri_' + split_type + '_manifest.csv') + create_manifest(split_dir, manifest_path) if __name__ == "__main__": From d849589b65b162122d1ebe1fc1a2fd0f66b78884 Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Wed, 5 Jun 2019 02:29:59 +0200 Subject: [PATCH 21/58] move manifest to data dir --- sonosco/datasets/AudioDataLoader.py | 9 ++++----- sonosco/datasets/AudioDataset.py | 7 ++++++- sonosco/datasets/datasets_test_script.py | 6 +++++- sonosco/pycandle_train.py | 7 +++++-- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/sonosco/datasets/AudioDataLoader.py b/sonosco/datasets/AudioDataLoader.py index 8786c2d..f8f7337 100644 --- a/sonosco/datasets/AudioDataLoader.py +++ b/sonosco/datasets/AudioDataLoader.py @@ -1,14 +1,13 @@ - import numpy as np import torch from torch.utils.data import Dataset, DataLoader, Sampler + class AudioDataLoader(DataLoader): + def __init__(self, *args, **kwargs): - """ - Creates a data loader for AudioDatasets. 
- """ + """Creates a data loader for AudioDatasets.""" super(AudioDataLoader, self).__init__(*args, **kwargs) self.collate_fn = self._collate_fn @@ -31,4 +30,4 @@ def _collate_fn(self, batch): target_sizes[x] = len(batch[x][1]) targets.extend(batch[x][1]) - return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) \ No newline at end of file + return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) diff --git a/sonosco/datasets/AudioDataset.py b/sonosco/datasets/AudioDataset.py index 0a95284..6aea72a 100644 --- a/sonosco/datasets/AudioDataset.py +++ b/sonosco/datasets/AudioDataset.py @@ -4,6 +4,7 @@ # ---------------------------------------------------------------------------- import warnings +import os from typing import Tuple import torch @@ -118,12 +119,14 @@ def __getitem__(self, index): def __len__(self): return self.size + def main(): audio_conf = dict(sample_rate=16000, window_size=.02, window_stride=.01, window='hamming') - test_manifest = '/Users/florianlay/data/libri_test_clean_manifest.csv' + manifest_directory = os.path.join(os.path.expanduser("~"), "temp/data/libri_speech") + test_manifest = os.path.join(manifest_directory, "libri_test_clean_manifest.csv") labels = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' test_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=test_manifest, labels=labels, normalize=False, augment=False) @@ -135,5 +138,7 @@ def main(): dataloader = DataLoader(dataset=test_dataset, num_workers=4, collate_fn=_collate_fn, batch_sampler=sampler) test_dataset[0] #inputs, targets, input_percentages, target_sizes = next(iter(dataloader)) + + if __name__ == "__main__": main() \ No newline at end of file diff --git a/sonosco/datasets/datasets_test_script.py b/sonosco/datasets/datasets_test_script.py index 7f8a11f..f4daf79 100644 --- a/sonosco/datasets/datasets_test_script.py +++ b/sonosco/datasets/datasets_test_script.py @@ -1,3 +1,5 @@ +import os + from AudioDataLoader import AudioDataLoader from AudioDataSampler import BucketingSampler, DistributedBucketingSampler from AudioDataset import AudioDataset @@ -12,7 +14,9 @@ def main(): window_size=.02, window_stride=.01, window='hamming') - test_manifest = '/Users/florianlay/data/libri_test_clean_manifest.csv' + + manifest_directory = os.path.join(os.path.expanduser("~"), "temp/data/libri_speech") + test_manifest = os.path.join(manifest_directory, "libri_test_clean_manifest.csv") labels = 'abc' test_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=test_manifest, labels=labels, normalize=False, augment=False) diff --git a/sonosco/pycandle_train.py b/sonosco/pycandle_train.py index 7310789..fc1fbbe 100644 --- a/sonosco/pycandle_train.py +++ b/sonosco/pycandle_train.py @@ -1,4 +1,5 @@ import torch +import os import torch.nn.functional as F #import datasets.download_datasets.librispeech as librispeech @@ -23,11 +24,13 @@ def load_datasets(manifest_path, batch_size_train, batch_size_test): sampler = BucketingSampler(test_dataset, batch_size=batch_size) return AudioDataLoader(test_dataset, num_workers=4, batch_sampler=sampler) + +manifest_directory = os.path.join(os.path.expanduser("~"), "temp/data/libri_speech") +test_manifest = os.path.join(manifest_directory, "libri_test_clean_manifest.csv") # librispeech.main() model = DeepSpeech2().cpu() experiment = Experiment('mnist_example') -train_loader = load_datasets("./datasets/download_datasets/libri_test_clean_manifest.csv", - batch_size_train=64, batch_size_test=64) +train_loader = 
load_datasets(test_manifest, batch_size_train=64, batch_size_test=64) optimizer = torch.optim.Adam(model.parameters(), lr=0.01) model_trainer = ModelTrainer(model, optimizer, F.nll_loss, 20, train_loader, gpu=0) model_trainer.start_training() From 98f0943da99670a803cf1a6aa6b429082c7d845b Mon Sep 17 00:00:00 2001 From: ga38nif Date: Thu, 6 Jun 2019 00:48:14 +0200 Subject: [PATCH 22/58] start refactoring common_voice --- .../download_datasets/common_voice.py | 83 ++++++++++--------- 1 file changed, 46 insertions(+), 37 deletions(-) diff --git a/sonosco/datasets/download_datasets/common_voice.py b/sonosco/datasets/download_datasets/common_voice.py index fbc7b91..70451ab 100644 --- a/sonosco/datasets/download_datasets/common_voice.py +++ b/sonosco/datasets/download_datasets/common_voice.py @@ -1,27 +1,32 @@ import os import wget +import click +import logging import tarfile -import argparse +import shutil import csv +import sonosco.common.audio_tools as audio_tools +import sonosco.common.path_utils as path_utils from multiprocessing.pool import ThreadPool import subprocess -from utils import create_manifest - -parser = argparse.ArgumentParser(description='Downloads and processes Mozilla Common Voice dataset.') -parser.add_argument("--target-dir", default='CommonVoice_dataset/', type=str, help="Directory to store the dataset.") -parser.add_argument("--tar-path", type=str, help="Path to the Common Voice *.tar file if downloaded (Optional).") -parser.add_argument('--sample-rate', default=16000, type=int, help='Sample rate') -parser.add_argument('--min-duration', default=1, type=int, - help='Prunes training samples shorter than the min duration (given in seconds, default 1)') -parser.add_argument('--max-duration', default=15, type=int, - help='Prunes training samples longer than the max duration (given in seconds, default 15)') -parser.add_argument('--files-to-process', default="cv-valid-dev.csv,cv-valid-test.csv,cv-valid-train.csv", - type=str, help='list of *.csv file names to process') -args = parser.parse_args() +from sonosco.datasets.download_datasets.data_utils import create_manifest +from sonosco.common.utils import setup_logging + +logger = logging.getLogger("sonosco") + COMMON_VOICE_URL = "https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz" +@click.command() +@click.option("--target-dir", default="temp/data/libri_speech", type=str, help="Directory to store the dataset.") +@click.option("--sample-rate", default=16000, type=int, help="Sample rate.") +@click.option("--files-to-use", multiple=True, + default=["cv-valid-dev.csv","cv-valid-test.csv","cv-valid-train.csv"]) +@click.option("--min-duration", default=1, type=int, + help="Prunes training samples shorter than the min duration (given in seconds).") +@click.option("--max-duration", default=15, type=int, + help="Prunes training samples longer than the max duration (given in seconds).") -def convert_to_wav(csv_file, target_dir): +def convert_to_wav(csv_file, target_dir, sample_rate): """ Read *.csv file description, convert mp3 to wav, process text. Save results to target_dir. 
Args: @@ -30,8 +35,8 @@ def convert_to_wav(csv_file, target_dir): """ wav_dir = os.path.join(target_dir, 'wav/') txt_dir = os.path.join(target_dir, 'txt/') - os.makedirs(wav_dir, exist_ok=True) - os.makedirs(txt_dir, exist_ok=True) + path_utils.try_create_directory(wav_dir) + path_utils.try_create_directory(txt_dir) path_to_data = os.path.dirname(csv_file) def process(x): @@ -42,7 +47,7 @@ def process(x): f.write(text) cmd = "sox {} -r {} -b 16 -c 1 {}".format( os.path.join(path_to_data, file_path), - args.sample_rate, + sample_rate, os.path.join(wav_dir, file_name + '.wav')) subprocess.call([cmd], shell=True) @@ -54,36 +59,40 @@ def process(x): pool.map(process, data) -def main(): - target_dir = args.target_dir - os.makedirs(target_dir, exist_ok=True) +def main(target_dir, sample_rate, files_to_use, min_duration, max_duration): + setup_logging(logger) + + path_to_data = os.path.join(os.path.expanduser("~"), target_dir) + path_utils.try_create_directory(path_to_data) - target_unpacked_dir = os.path.join(target_dir, "CV_unpacked") - os.makedirs(target_unpacked_dir, exist_ok=True) + target_unpacked_dir = os.path.join(target_dir, "common_unpacked") + path_utils.try_create_directory(target_unpacked_dir) - if args.tar_path and os.path.exists(args.tar_path): - print('Find existing file {}'.format(args.tar_path)) - target_file = args.tar_path - else: - print("Could not find downloaded Common Voice archive, Downloading corpus...") - filename = wget.download(COMMON_VOICE_URL, target_dir) - target_file = os.path.join(target_dir, os.path.basename(filename)) + extracted_dir = os.path.join(path_to_data, "CommonVoice") + if os.path.exists(extracted_dir): + shutil.rmtree(extracted_dir) + + path_utils.try_download(target_unpacked_dir, COMMON_VOICE_URL) + + logger.info("Download complete") + logger.info("Unpacking...") print("Unpacking corpus to {} ...".format(target_unpacked_dir)) - tar = tarfile.open(target_file) - tar.extractall(target_unpacked_dir) + tar = tarfile.open(target_unpacked_dir) + tar.extractall(extracted_dir) tar.close() for csv_file in args.files_to_process.split(','): - convert_to_wav(os.path.join(target_unpacked_dir, 'cv_corpus_v1/', csv_file), - os.path.join(target_dir, os.path.splitext(csv_file)[0])) + convert_to_wav(os.path.join(extracted_dir, 'cv_corpus_v1/', csv_file), + os.path.join(target_dir, os.path.splitext(csv_file)[0]), + sample_rate) print('Creating manifests...') - for csv_file in args.files_to_process.split(','): + for csv_file in files_to_use.split(','): create_manifest(os.path.join(target_dir, os.path.splitext(csv_file)[0]), os.path.splitext(csv_file)[0] + '_manifest.csv', - args.min_duration, - args.max_duration) + min_duration, + max_duration) if __name__ == "__main__": From 2445d7133b6e7ade9d74e0edb231decc8349aa36 Mon Sep 17 00:00:00 2001 From: "w.jurasz" Date: Thu, 6 Jun 2019 09:40:07 +0200 Subject: [PATCH 23/58] Removed pycandle, improved local install --- install_dependencies.sh | 14 ++++++++--- sonosco/datasets/AudioDataLoader.py | 5 ++-- sonosco/models/deepspeech2.py | 12 ++++------ sonosco/pycandle_train.py | 36 ----------------------------- 4 files changed, 17 insertions(+), 50 deletions(-) delete mode 100644 sonosco/pycandle_train.py diff --git a/install_dependencies.sh b/install_dependencies.sh index d4ad0a0..2be3b54 100755 --- a/install_dependencies.sh +++ b/install_dependencies.sh @@ -1,6 +1,10 @@ #!/usr/bin/env bash -#This scripts assumes that you have a virtual env in ./venv, you can override this by ./install_dependencies.sh -p /some/other/path + +# 
Running without arguments -> installing into virtual env located in ./venv +# -a= takes precedence before the virtual env and installs to conda env +# -e=/path/to/venv installs in different venv then ./venv +# -c=true installs with cuda support (default false) set -e @@ -47,7 +51,7 @@ else export CUDA_HOME="/usr/local/cuda" fi cd warp-ctc; mkdir build; cd build; cmake ..; make -cd ../pytorch_binding && python setup.py install +cd ../pytorch_binding && MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py install cd ../.. rm -rf warp-ctc @@ -56,4 +60,8 @@ cd audio; MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py ins cd .. rm -rf audio -pip install -r post_requirements.txt \ No newline at end of file +pip install -r post_requirements.txt + +if [ -f ./src/pip-delete-this-directory.txt ]; then + rm -rf ./src/ +fi \ No newline at end of file diff --git a/sonosco/datasets/AudioDataLoader.py b/sonosco/datasets/AudioDataLoader.py index d1dbccd..967b89c 100644 --- a/sonosco/datasets/AudioDataLoader.py +++ b/sonosco/datasets/AudioDataLoader.py @@ -28,7 +28,6 @@ def _collate_fn(self, batch): inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) target_sizes[x] = len(batch[x][1]) - targets.append([batch[x][1]]) + targets.extend(batch[x][1]) - # return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) - return (inputs, input_percentages), torch.IntTensor(targets) \ No newline at end of file + return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) \ No newline at end of file diff --git a/sonosco/models/deepspeech2.py b/sonosco/models/deepspeech2.py index 5d2dff8..dcfa100 100644 --- a/sonosco/models/deepspeech2.py +++ b/sonosco/models/deepspeech2.py @@ -82,14 +82,13 @@ def forward(self, input_): class BatchRNN(nn.Module): - def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, batch_norm=True, bidirectional=True): + def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, batch_norm=True): super(BatchRNN, self).__init__() - self.bidirectional = bidirectional self.input_size = input_size self.hidden_size = hidden_size self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size, - bidirectional=self.bidirectional, bias=True) + bidirectional=True, bias=True) def flatten_parameters(self): self.rnn.flatten_parameters() @@ -156,12 +155,9 @@ def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hid_size=768, nb_layers=5 self.inference_softmax = InferenceBatchSoftmax() - def forward(self, xx): + def forward(self, x, lengths): # if x.is_cuda and self.mixed_precision: # x = x.half() - x, input_percentages = xx - - lengths = input_percentages.mul_(int(x.size(3))).int() lengths = lengths.cpu().int() output_lengths = self.get_seq_lens(lengths) x, _ = self.conv(x, output_lengths) @@ -180,7 +176,7 @@ def forward(self, xx): x = x.transpose(0, 1) # identity in training mode, softmax in eval mode x = self.inference_softmax(x) - return x + return x, output_lengths def get_seq_lens(self, input_length): """ diff --git a/sonosco/pycandle_train.py b/sonosco/pycandle_train.py deleted file mode 100644 index 71ec2ad..0000000 --- a/sonosco/pycandle_train.py +++ /dev/null @@ -1,36 +0,0 @@ -import torch -import os -import torch.nn.functional as F -#import datasets.download_datasets.librispeech as librispeech - -from datasets.AudioDataLoader import 
AudioDataLoader -from datasets.AudioDataSampler import BucketingSampler -from datasets.AudioDataset import AudioDataset -from models.deepspeech2 import DeepSpeech2 -from pycandle.general.experiment import Experiment -from pycandle.training.model_trainer import ModelTrainer - -def load_datasets(manifest_path, batch_size_train, batch_size_test): - audio_conf = dict(sample_rate=16000, - window_size=.02, - window_stride=.01, - window='hamming') - labels = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' - test_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=manifest_path, labels=labels, - normalize=False, augment=False) - print("Dataset is created\n====================\n") - - batch_size = 16 - sampler = BucketingSampler(test_dataset, batch_size=batch_size) - return AudioDataLoader(test_dataset, num_workers=4, batch_sampler=sampler) - - -manifest_directory = os.path.join(os.path.expanduser("~"), "temp/data/libri_speech") -test_manifest = os.path.join(manifest_directory, "libri_test_clean_manifest.csv") -# librispeech.main() -model = DeepSpeech2().cpu() -experiment = Experiment('mnist_example') -train_loader = load_datasets(test_manifest, batch_size_train=64, batch_size_test=64) -optimizer = torch.optim.Adam(model.parameters(), lr=0.01) -model_trainer = ModelTrainer(model, optimizer, F.nll_loss, 20, train_loader, gpu=None) -model_trainer.start_training() From 7752aaf991f65dfb5a65343afc8b97cad7206aff Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Thu, 6 Jun 2019 11:29:17 +0200 Subject: [PATCH 24/58] refactoring --- sonosco/common/constants.py | 1 + sonosco/datasets/AudioDataset.py | 2 - .../datasets/download_datasets/librispeech.py | 80 ++++++++++--------- 3 files changed, 43 insertions(+), 40 deletions(-) create mode 100644 sonosco/common/constants.py diff --git a/sonosco/common/constants.py b/sonosco/common/constants.py new file mode 100644 index 0000000..3d06bc6 --- /dev/null +++ b/sonosco/common/constants.py @@ -0,0 +1 @@ +SONOSCO = "sonosco" \ No newline at end of file diff --git a/sonosco/datasets/AudioDataset.py b/sonosco/datasets/AudioDataset.py index 6aea72a..0c271f7 100644 --- a/sonosco/datasets/AudioDataset.py +++ b/sonosco/datasets/AudioDataset.py @@ -66,8 +66,6 @@ def parse_audio(self, audio_path): win_length=int(self.sample_rate * self.window_size), window=torch.hamming_window(int(self.sample_rate * self.window_size)))[:, :, -1] - - if self.normalize: mean = spectrogram.mean() std = spectrogram.std() diff --git a/sonosco/datasets/download_datasets/librispeech.py b/sonosco/datasets/download_datasets/librispeech.py index 38b3f61..9c0e6e4 100644 --- a/sonosco/datasets/download_datasets/librispeech.py +++ b/sonosco/datasets/download_datasets/librispeech.py @@ -8,10 +8,11 @@ from sonosco.datasets.download_datasets.data_utils import create_manifest from sonosco.common.utils import setup_logging +from sonosco.common.constants import * from tqdm import tqdm -logger = logging.getLogger("sonosco") +logger = logging.getLogger(SONOSCO) LIBRI_SPEECH_URLS = { @@ -27,43 +28,7 @@ } -def _preprocess_transcript(phrase): - return phrase.strip().upper() - - -def _process_file(wav_dir, txt_dir, base_filename, root_dir, sample_rate): - full_recording_path = os.path.join(root_dir, base_filename) - assert os.path.exists(full_recording_path) and os.path.exists(root_dir) - wav_recording_path = os.path.join(wav_dir, base_filename.replace(".flac", ".wav")) - audio_tools.transcode_recording(full_recording_path, wav_recording_path, sample_rate) - # process transcript - txt_transcript_path = 
os.path.join(txt_dir, base_filename.replace(".flac", ".txt")) - transcript_file = os.path.join(root_dir, "-".join(base_filename.split('-')[:-1]) + ".trans.txt") - assert os.path.exists(transcript_file), f"Transcript file {transcript_file} does not exist" - transcriptions = open(transcript_file).read().strip().split("\n") - transcriptions = {t.split()[0].split("-")[-1]: " ".join(t.split()[1:]) for t in transcriptions} - with open(txt_transcript_path, "w") as f: - key = base_filename.replace(".flac", "").split("-")[-1] - assert key in transcriptions, f"{key} is not in the transcriptions" - f.write(_preprocess_transcript(transcriptions[key])) - f.flush() - - -@click.command() -@click.option("--target-dir", default="temp/data/libri_speech", type=str, help="Directory to store the dataset.") -@click.option("--sample-rate", default=16000, type=int, help="Sample rate.") -@click.option("--files-to-use", multiple=True, - default=["train-clean-100.tar.gz", "train-clean-360.tar.gz", "train-other-500.tar.gz", - "dev-clean.tar.gz", "dev-other.tar.gz", "test-clean.tar.gz", "test-other.tar.gz"], - type=str, help="List of file names to download.") -@click.option("--min-duration", default=1, type=int, - help="Prunes training samples shorter than the min duration (given in seconds).") -@click.option("--max-duration", default=15, type=int, - help="Prunes training samples longer than the max duration (given in seconds).") -def main(target_dir, sample_rate, files_to_use, min_duration, max_duration): - """Processes and downloads LibriSpeech dataset.""" - setup_logging(logger) - +def try_download_librispeech(target_dir, sample_rate, files_to_use, min_duration, max_duration): path_to_data = os.path.join(os.path.expanduser("~"), target_dir) if not os.path.exists(path_to_data): os.makedirs(path_to_data) @@ -121,5 +86,44 @@ def main(target_dir, sample_rate, files_to_use, min_duration, max_duration): create_manifest(split_dir, manifest_path) +def _preprocess_transcript(phrase): + return phrase.strip().upper() + + +def _process_file(wav_dir, txt_dir, base_filename, root_dir, sample_rate): + full_recording_path = os.path.join(root_dir, base_filename) + assert os.path.exists(full_recording_path) and os.path.exists(root_dir) + wav_recording_path = os.path.join(wav_dir, base_filename.replace(".flac", ".wav")) + audio_tools.transcode_recording(full_recording_path, wav_recording_path, sample_rate) + # process transcript + txt_transcript_path = os.path.join(txt_dir, base_filename.replace(".flac", ".txt")) + transcript_file = os.path.join(root_dir, "-".join(base_filename.split('-')[:-1]) + ".trans.txt") + assert os.path.exists(transcript_file), f"Transcript file {transcript_file} does not exist" + transcriptions = open(transcript_file).read().strip().split("\n") + transcriptions = {t.split()[0].split("-")[-1]: " ".join(t.split()[1:]) for t in transcriptions} + with open(txt_transcript_path, "w") as f: + key = base_filename.replace(".flac", "").split("-")[-1] + assert key in transcriptions, f"{key} is not in the transcriptions" + f.write(_preprocess_transcript(transcriptions[key])) + f.flush() + + +@click.command() +@click.option("--target-dir", default="temp/data/libri_speech", type=str, help="Directory to store the dataset.") +@click.option("--sample-rate", default=16000, type=int, help="Sample rate.") +@click.option("--files-to-use", multiple=True, + default=["train-clean-100.tar.gz", "train-clean-360.tar.gz", "train-other-500.tar.gz", + "dev-clean.tar.gz", "dev-other.tar.gz", "test-clean.tar.gz", "test-other.tar.gz"], + 
type=str, help="List of file names to download.") +@click.option("--min-duration", default=1, type=int, + help="Prunes training samples shorter than the min duration (given in seconds).") +@click.option("--max-duration", default=15, type=int, + help="Prunes training samples longer than the max duration (given in seconds).") +def main(**kwargs): + """Processes and downloads LibriSpeech dataset.""" + setup_logging(logger) + try_download_librispeech(**kwargs) + + if __name__ == "__main__": main() From 3968b613b92a069412b00e1b3f51fd653d51cc11 Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Thu, 6 Jun 2019 12:46:55 +0200 Subject: [PATCH 25/58] start refactoring audio dataset --- sonosco/config/__init__.py | 0 sonosco/config/global_settings.py | 1 + sonosco/datasets/AudioDataset.py | 107 ++++++++++-------- .../datasets/download_datasets/data_utils.py | 1 - .../datasets/download_datasets/librispeech.py | 4 +- 5 files changed, 63 insertions(+), 50 deletions(-) create mode 100644 sonosco/config/__init__.py create mode 100644 sonosco/config/global_settings.py diff --git a/sonosco/config/__init__.py b/sonosco/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sonosco/config/global_settings.py b/sonosco/config/global_settings.py new file mode 100644 index 0000000..2f2522a --- /dev/null +++ b/sonosco/config/global_settings.py @@ -0,0 +1 @@ +CUDA_ENABLED = False \ No newline at end of file diff --git a/sonosco/datasets/AudioDataset.py b/sonosco/datasets/AudioDataset.py index 0c271f7..098d195 100644 --- a/sonosco/datasets/AudioDataset.py +++ b/sonosco/datasets/AudioDataset.py @@ -5,39 +5,50 @@ import warnings import os -from typing import Tuple - +import logging import torch import torchaudio -from scipy import signal +import sonosco.config.global_settings as global_settings + +from typing import Tuple from torch.utils.data import Dataset +from sonosco.common.utils import setup_logging +from sonosco.common.constants import * -windows = {"bartlett": torch.bartlett_window, - "blackman": torch.blackman_window, - "hamming": torch.hamming_window, - "hann": torch.hann_window} +logger = logging.getLogger(__name__) -class DataProcessor(object): - def __init__(self, audio_conf, labels="abc", normalize=False, augment=False): + +class DataProcessor: + + def __init__(self, window_stride, window_size, sample_rate, labels="abc", normalize=False, augment=False): """ Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by a comma. Each new line is a different sample. Example below: /path/to/audio.wav,/path/to/audio.txt ... 
- :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds - :param labels: String containing all the possible characters to map to - :param normalize: Apply standard mean and deviation normalization to audio tensor - :param augment(default False): Apply random tempo and gain perturbations + :param window_stride: number of seconds to skip between each window + :param window_size: number of seconds to use for a window of spectrogram + :param sample_rate: sample rate of the recordings + :param labels: string containing all the possible characters to map to + :param normalize: apply standard mean and deviation normalization to audio tensor + :param augment(default False): apply random tempo and gain perturbations """ + self.window_stride = window_stride + self.window_size = window_size + self.sample_rate = sample_rate self.labels_map = dict([(labels[i], i) for i in range(len(labels))]) - self.window_stride = audio_conf["window_stride"] - self.window_size = audio_conf["window_size"] - self.sample_rate = audio_conf["sample_rate"] - self.window = windows.get(audio_conf["window"], windows["hamming"]) self.normalize = normalize self.augment = augment + @property + def window_stride_samples(self): + return int(self.sample_rate * self.window_stride) + + @property + def window_size_samples(self): + return int(self.sample_rate * self.window_stride) + @staticmethod def retrieve_file(audio_path): sound, sample_rate = torchaudio.load(audio_path) @@ -45,57 +56,52 @@ def retrieve_file(audio_path): @staticmethod def augment_audio(sound, tempo_range: Tuple = (0.85, 1.15), gain_range: Tuple = (-6, 8)): - """ - Changes tempo and gain of the wave - """ + """Changes tempo and gain of the wave.""" warnings.warn("Augmentation is not implemented") # TODO: Implement return sound def parse_audio(self, audio_path): sound, sample_rate = self.retrieve_file(audio_path) + if sample_rate != self.sample_rate: raise ValueError(f"The stated sample rate {self.sample_rate} and the factual rate {sample_rate} differ!") if self.augment: sound = self.augment_audio(sound) - #sound = sound.cuda() - spectrogram = torch.stft(torch.from_numpy(sound.numpy().T.squeeze()), - n_fft=int(self.sample_rate * self.window_size), - hop_length=int(self.sample_rate * self.window_stride), - win_length=int(self.sample_rate * self.window_size), - window=torch.hamming_window(int(self.sample_rate * self.window_size)))[:, :, -1] + if global_settings.CUDA_ENABLED: + sound = sound.cuda() - if self.normalize: - mean = spectrogram.mean() - std = spectrogram.std() - spectrogram.add_(-mean) - spectrogram.div_(std) + # TODO: comment why take the last element? + spectrogram = torch.stft(torch.from_numpy(sound.numpy().T.squeeze()), + n_fft=self.window_size_samples, + hop_length=self.window_stride_samples, + win_length=self.window_size_samples, + window=torch.hamming_window(self.window_size_samples), + normalized=self.normalize)[:, :, -1] return spectrogram def parse_transcript(self, transcript_path): with open(transcript_path, 'r', encoding='utf8') as transcript_file: transcript = transcript_file.read().replace('\n', '') - print(f"1: {transcript}") + logger.info(f"1: {transcript}") # TODO: Is it fast enough? 
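        # The line below maps every character through labels_map and relies on
        # filter(None, ...) to drop characters outside the label set. Caveat
        # (an observation, not behaviour asserted by the repo): filter(None, ...)
        # also drops the label mapped to index 0 ('A' for the alphabet used in
        # main() below), because 0 is falsy. A minimal sketch of the difference:
        #   labels_map = {'A': 0, 'B': 1}
        #   list(filter(None, [labels_map.get(c) for c in "ABX"]))            # -> [1]
        #   [i for i in (labels_map.get(c) for c in "ABX") if i is not None]  # -> [0, 1]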
transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)])) - print(f"transcript_path: {transcript_path}\ntranscript: {transcript}") + logger.info(f"transcript_path: {transcript_path} transcript: {transcript}") return transcript class AudioDataset(Dataset): - def __init__(self, audio_conf, manifest_filepath, labels, normalize=False, augment=False): + + def __init__(self, processor: DataProcessor, manifest_filepath): """ Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by a comma. Each new line is a different sample. Example below: /path/to/audio.wav,/path/to/audio.txt ... - :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds + :param processor: Data processor object :param manifest_filepath: Path to manifest csv as describe above - :param labels: String containing all the possible characters to map to - :param normalize: Apply standard mean and deviation normalization to audio tensor - :param augment(default False): Apply random tempo and gain perturbations """ super(AudioDataset, self).__init__() with open(manifest_filepath) as f: @@ -103,7 +109,7 @@ def __init__(self, audio_conf, manifest_filepath, labels, normalize=False, augme ids = [x.strip().split(',') for x in ids] self.ids = ids self.size = len(ids) - self.processor = DataProcessor(audio_conf, labels, normalize, augment) + self.processor = processor def __getitem__(self, index): sample = self.ids[index] @@ -119,17 +125,22 @@ def __len__(self): def main(): - audio_conf = dict(sample_rate=16000, - window_size=.02, - window_stride=.01, - window='hamming') + global logger + logger = logging.getLogger(SONOSCO) + setup_logging(logger) + + # create data processor + audio_conf = dict(sample_rate=16000, window_size=.02, window_stride=.01, + labels='ABCDEFGHIJKLMNOPQRSTUVWXYZ', normalize=True, augment=False) + processor = DataProcessor(**audio_conf) + + # get manifest file manifest_directory = os.path.join(os.path.expanduser("~"), "temp/data/libri_speech") test_manifest = os.path.join(manifest_directory, "libri_test_clean_manifest.csv") - labels = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' - test_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=test_manifest, labels=labels, - normalize=False, augment=False) - print("Dataset is created\n====================\n") + # create audio dataset + test_dataset = AudioDataset(processor, manifest_filepath=test_manifest) + logger.info("Dataset is created") test = test_dataset[0] batch_size = 16 sampler = BucketingSampler(test_dataset, batch_size=batch_size) @@ -139,4 +150,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/sonosco/datasets/download_datasets/data_utils.py b/sonosco/datasets/download_datasets/data_utils.py index e6a5cd8..e7e15d5 100644 --- a/sonosco/datasets/download_datasets/data_utils.py +++ b/sonosco/datasets/download_datasets/data_utils.py @@ -7,7 +7,6 @@ from tqdm import tqdm -import pdb; pdb.set_trace() logger = logging.getLogger(__name__) diff --git a/sonosco/datasets/download_datasets/librispeech.py b/sonosco/datasets/download_datasets/librispeech.py index 9c0e6e4..59df937 100644 --- a/sonosco/datasets/download_datasets/librispeech.py +++ b/sonosco/datasets/download_datasets/librispeech.py @@ -12,7 +12,7 @@ from tqdm import tqdm -logger = logging.getLogger(SONOSCO) +logger = logging.getLogger(__name__) LIBRI_SPEECH_URLS = { @@ -121,6 +121,8 @@ def _process_file(wav_dir, txt_dir, base_filename, root_dir, 
sample_rate): help="Prunes training samples longer than the max duration (given in seconds).") def main(**kwargs): """Processes and downloads LibriSpeech dataset.""" + global logger + logger = logging.getLogger(SONOSCO) setup_logging(logger) try_download_librispeech(**kwargs) From 028f1aeda36d60b9cf3921928dd34ed379b0b179 Mon Sep 17 00:00:00 2001 From: ga38nif Date: Tue, 11 Jun 2019 17:49:27 +0200 Subject: [PATCH 26/58] adapt common_voice to click logger [Needs test with good internet connectivity] --- .../download_datasets/common_voice.py | 82 ++++++++++--------- 1 file changed, 45 insertions(+), 37 deletions(-) diff --git a/sonosco/datasets/download_datasets/common_voice.py b/sonosco/datasets/download_datasets/common_voice.py index 70451ab..e038dab 100644 --- a/sonosco/datasets/download_datasets/common_voice.py +++ b/sonosco/datasets/download_datasets/common_voice.py @@ -11,13 +11,14 @@ import subprocess from sonosco.datasets.download_datasets.data_utils import create_manifest from sonosco.common.utils import setup_logging +from sonosco.common.constants import * -logger = logging.getLogger("sonosco") +logger = logging.getLogger(__name__) COMMON_VOICE_URL = "https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz" @click.command() -@click.option("--target-dir", default="temp/data/libri_speech", type=str, help="Directory to store the dataset.") +@click.option("--target-dir", default="temp/data/common_voice", type=str, help="Directory to store the dataset.") @click.option("--sample-rate", default=16000, type=int, help="Sample rate.") @click.option("--files-to-use", multiple=True, default=["cv-valid-dev.csv","cv-valid-test.csv","cv-valid-train.csv"]) @@ -25,6 +26,44 @@ help="Prunes training samples shorter than the min duration (given in seconds).") @click.option("--max-duration", default=15, type=int, help="Prunes training samples longer than the max duration (given in seconds).") +def try_download_common_voice(target_dir, sample_rate, files_to_use, min_duration, max_duration): + path_to_data = os.path.join(os.path.expanduser("~"), target_dir) + path_utils.try_create_directory(path_to_data) + + target_unpacked_dir = os.path.join(path_to_data, "common_unpacked") + path_utils.try_create_directory(target_unpacked_dir) + + extracted_dir = os.path.join(path_to_data, "CommonVoice") + if os.path.exists(extracted_dir): + shutil.rmtree(extracted_dir) + logger.info("Start downloading...") + file_name = COMMON_VOICE_URL.split("/")[-1] + target_filename = os.path.join(target_unpacked_dir, file_name) + path_utils.try_download(target_filename, COMMON_VOICE_URL) + + logger.info("Download complete") + logger.info("Unpacking...") + tar = tarfile.open(target_filename) + tar.extractall(extracted_dir) + tar.close() + os.remove(target_unpacked_dir) + assert os.path.exists(extracted_dir), f"Archive {file_name} was not properly uncompressed" + logger.info("Converting files to wav and extracting transcripts...") + for csv_file in files_to_use.split(','): + convert_to_wav(os.path.join(extracted_dir, 'cv_corpus_v1/', csv_file), + os.path.join(target_dir, os.path.splitext(csv_file)[0]), + sample_rate) + logger.info(f"Finished {COMMON_VOICE_URL}") + shutil.rmtree(extracted_dir) + + logger.info('Creating manifests...') + for csv_file in files_to_use.split(','): + create_manifest(os.path.join(path_to_data, os.path.splitext(csv_file)[0]), + os.path.splitext(csv_file)[0] + '_manifest.csv', + min_duration, + max_duration) + + def convert_to_wav(csv_file, target_dir, sample_rate): """ Read *.csv file 
From 028f1aeda36d60b9cf3921928dd34ed379b0b179 Mon Sep 17 00:00:00 2001
From: ga38nif
Date: Tue, 11 Jun 2019 17:49:27 +0200
Subject: [PATCH 26/58] adapt common_voice to click logger [Needs test with good internet connectivity]

---
 .../download_datasets/common_voice.py         | 82 ++++++++++---------
 1 file changed, 45 insertions(+), 37 deletions(-)

diff --git a/sonosco/datasets/download_datasets/common_voice.py b/sonosco/datasets/download_datasets/common_voice.py
index 70451ab..e038dab 100644
--- a/sonosco/datasets/download_datasets/common_voice.py
+++ b/sonosco/datasets/download_datasets/common_voice.py
@@ -11,13 +11,14 @@ import subprocess
 from sonosco.datasets.download_datasets.data_utils import create_manifest
 from sonosco.common.utils import setup_logging
+from sonosco.common.constants import *

-logger = logging.getLogger("sonosco")
+logger = logging.getLogger(__name__)

 COMMON_VOICE_URL = "https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz"

 @click.command()
-@click.option("--target-dir", default="temp/data/libri_speech", type=str, help="Directory to store the dataset.")
+@click.option("--target-dir", default="temp/data/common_voice", type=str, help="Directory to store the dataset.")
 @click.option("--sample-rate", default=16000, type=int, help="Sample rate.")
 @click.option("--files-to-use", multiple=True,
               default=["cv-valid-dev.csv","cv-valid-test.csv","cv-valid-train.csv"])
@@ -25,6 +26,44 @@
               help="Prunes training samples shorter than the min duration (given in seconds).")
 @click.option("--max-duration", default=15, type=int,
               help="Prunes training samples longer than the max duration (given in seconds).")
+def try_download_common_voice(target_dir, sample_rate, files_to_use, min_duration, max_duration):
+    path_to_data = os.path.join(os.path.expanduser("~"), target_dir)
+    path_utils.try_create_directory(path_to_data)
+
+    target_unpacked_dir = os.path.join(path_to_data, "common_unpacked")
+    path_utils.try_create_directory(target_unpacked_dir)
+
+    extracted_dir = os.path.join(path_to_data, "CommonVoice")
+    if os.path.exists(extracted_dir):
+        shutil.rmtree(extracted_dir)
+    logger.info("Start downloading...")
+    file_name = COMMON_VOICE_URL.split("/")[-1]
+    target_filename = os.path.join(target_unpacked_dir, file_name)
+    path_utils.try_download(target_filename, COMMON_VOICE_URL)
+
+    logger.info("Download complete")
+    logger.info("Unpacking...")
+    tar = tarfile.open(target_filename)
+    tar.extractall(extracted_dir)
+    tar.close()
+    shutil.rmtree(target_unpacked_dir)
+    assert os.path.exists(extracted_dir), f"Archive {file_name} was not properly uncompressed"
+    logger.info("Converting files to wav and extracting transcripts...")
+    for csv_file in files_to_use:
+        convert_to_wav(os.path.join(extracted_dir, 'cv_corpus_v1/', csv_file),
+                       os.path.join(path_to_data, os.path.splitext(csv_file)[0]),
+                       sample_rate)
+    logger.info(f"Finished {COMMON_VOICE_URL}")
+    shutil.rmtree(extracted_dir)
+
+    logger.info('Creating manifests...')
+    for csv_file in files_to_use:
+        create_manifest(os.path.join(path_to_data, os.path.splitext(csv_file)[0]),
+                        os.path.splitext(csv_file)[0] + '_manifest.csv',
+                        min_duration,
+                        max_duration)
+
+
 def convert_to_wav(csv_file, target_dir, sample_rate):
     """ Read *.csv file description, convert mp3 to wav, process text.
@@ -51,48 +90,17 @@ def process(x):
                 os.path.join(wav_dir, file_name + '.wav'))
         subprocess.call([cmd], shell=True)

-    print('Converting mp3 to wav for {}.'.format(csv_file))
+    logger.info('Converting mp3 to wav for {}.'.format(csv_file))
     with open(csv_file) as csvfile:
         reader = csv.DictReader(csvfile)
         data = [(row['filename'], row['text']) for row in reader]
         with ThreadPool(10) as pool:
             pool.map(process, data)
-
-def main(target_dir, sample_rate, files_to_use, min_duration, max_duration):
+def main(**kwargs):
+    logger = logging.getLogger(SONOSCO)
     setup_logging(logger)
-
-    path_to_data = os.path.join(os.path.expanduser("~"), target_dir)
-    path_utils.try_create_directory(path_to_data)
-
-    target_unpacked_dir = os.path.join(target_dir, "common_unpacked")
-    path_utils.try_create_directory(target_unpacked_dir)
-
-    extracted_dir = os.path.join(path_to_data, "CommonVoice")
-    if os.path.exists(extracted_dir):
-        shutil.rmtree(extracted_dir)
-
-    path_utils.try_download(target_unpacked_dir, COMMON_VOICE_URL)
-
-    logger.info("Download complete")
-    logger.info("Unpacking...")
-
-    print("Unpacking corpus to {} ...".format(target_unpacked_dir))
-    tar = tarfile.open(target_unpacked_dir)
-    tar.extractall(extracted_dir)
-    tar.close()
-
-    for csv_file in args.files_to_process.split(','):
-        convert_to_wav(os.path.join(extracted_dir, 'cv_corpus_v1/', csv_file),
-                       os.path.join(target_dir, os.path.splitext(csv_file)[0]),
-                       sample_rate)
-
-    print('Creating manifests...')
-    for csv_file in files_to_use.split(','):
-        create_manifest(os.path.join(target_dir, os.path.splitext(csv_file)[0]),
-                        os.path.splitext(csv_file)[0] + '_manifest.csv',
-                        min_duration,
-                        max_duration)
+    try_download_common_voice(**kwargs)

 if __name__ == "__main__":

From b0ffed4a378bcbe1d522ba6ec2f4f8e7bcf50b3e Mon Sep 17 00:00:00 2001
From: ga38nif
Date: Wed, 12 Jun 2019 09:16:24 +0200
Subject: [PATCH 27/58] add function for an4 dataset to audio_tools

---
 sonosco/common/audio_tools.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sonosco/common/audio_tools.py b/sonosco/common/audio_tools.py
index 21c811b..77befb6 100644
--- a/sonosco/common/audio_tools.py
+++ b/sonosco/common/audio_tools.py
@@ -7,3 +7,6 @@ def get_duration(file_path):

 def transcode_recording(source, destination, sample_rate):
     subprocess.call([f"sox {source} -r {sample_rate} -b 16 -c 1 {destination}"], shell=True)
+
+def transcode_recordings_an4(raw_path, wav_path, sample_rate):
+    subprocess.call([f'sox -t raw -r {sample_rate} -b 16 -e signed-integer -B -c 1 \"{raw_path}\" \"{wav_path}\"'], shell=True)
\ No newline at end of file

From 52a24cd2537029eda7817ad65a4c4fb2f9ce59bf Mon Sep 17 00:00:00 2001
From: ga38nif
Date: Wed, 12 Jun 2019 09:17:09 +0200
Subject: [PATCH 28/58] add audio_tools call to common voice

---
 sonosco/datasets/download_datasets/common_voice.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/sonosco/datasets/download_datasets/common_voice.py b/sonosco/datasets/download_datasets/common_voice.py
index e038dab..7ea1eff 100644
--- a/sonosco/datasets/download_datasets/common_voice.py
+++ b/sonosco/datasets/download_datasets/common_voice.py
@@ -84,11 +84,7 @@ def process(x):
             text = text.strip().upper()
             with open(os.path.join(txt_dir, file_name + '.txt'), 'w') as f:
                 f.write(text)
-            cmd = "sox {} -r {} -b 16 -c 1 {}".format(
-                os.path.join(path_to_data, file_path),
-                sample_rate,
-                os.path.join(wav_dir, file_name + '.wav'))
-            subprocess.call([cmd], shell=True)
+            audio_tools.transcode_recording(source=os.path.join(path_to_data, file_path),
+                                            destination=os.path.join(wav_dir, file_name + '.wav'),
+                                            sample_rate=sample_rate)

     logger.info('Converting mp3 to wav for {}.'.format(csv_file))
     with open(csv_file) as csvfile:
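The audio_tools helpers used above build their sox command lines with f-strings and shell=True, which breaks on paths containing spaces or shell metacharacters. A quoting-safe variant of transcode_recording is sketched below; the name transcode_recording_safe is hypothetical, and sox is assumed to be on the PATH.

import subprocess


def transcode_recording_safe(source, destination, sample_rate):
    # Same command as transcode_recording, but passed as an argument list
    # so the shell never re-parses the file paths.
    subprocess.run(
        ["sox", str(source), "-r", str(sample_rate), "-b", "16", "-c", "1", str(destination)],
        check=True,
    )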
From d3796279c59883d4002ce14dd5cdf078775841a5 Mon Sep 17 00:00:00 2001
From: ga38nif
Date: Wed, 12 Jun 2019 09:18:00 +0200
Subject: [PATCH 29/58] adapt an4 to new style and make it work.

---
 sonosco/datasets/download_datasets/an4.py | 133 +++++++++++++---------
 1 file changed, 77 insertions(+), 56 deletions(-)

diff --git a/sonosco/datasets/download_datasets/an4.py b/sonosco/datasets/download_datasets/an4.py
index f810ee0..a36f952 100644
--- a/sonosco/datasets/download_datasets/an4.py
+++ b/sonosco/datasets/download_datasets/an4.py
@@ -1,60 +1,84 @@
-import argparse
 import os
+import click
 import io
 import shutil
 import tarfile
-import wget
-
-from utils import create_manifest
-
-parser = argparse.ArgumentParser(description='Processes and downloads an4.')
-parser.add_argument('--target-dir', default='an4_dataset/', help='Path to save dataset')
-parser.add_argument('--min-duration', default=1, type=int,
-                    help='Prunes training samples shorter than the min duration (given in seconds, default 1)')
-parser.add_argument('--max-duration', default=15, type=int,
-                    help='Prunes training samples longer than the max duration (given in seconds, default 15)')
-args = parser.parse_args()
-
-
-def _format_data(root_path, data_tag, name, wav_folder):
-    data_path = args.target_dir + data_tag + '/' + name + '/'
-    new_transcript_path = data_path + '/txt/'
-    new_wav_path = data_path + '/wav/'
-
-    os.makedirs(new_transcript_path)
-    os.makedirs(new_wav_path)
-
-    wav_path = root_path + 'wav/'
-    file_ids = root_path + 'etc/an4_%s.fileids' % data_tag
-    transcripts = root_path + 'etc/an4_%s.transcription' % data_tag
-    train_path = wav_path + wav_folder
-
-    _convert_audio_to_wav(train_path)
-    _format_files(file_ids, new_transcript_path, new_wav_path, transcripts, wav_path)
-
-
-def _convert_audio_to_wav(train_path):
+import logging
+import sonosco.common.audio_tools as audio_tools
+import sonosco.common.path_utils as path_utils
+
+from sonosco.datasets.download_datasets.data_utils import create_manifest
+from sonosco.common.utils import setup_logging
+from sonosco.common.constants import *
+
+logger = logging.getLogger(__name__)
+
+AN4_URL = 'http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz'
+
+def try_download_an4(target_dir, sample_rate, min_duration, max_duration):
+    path_to_data = os.path.join(os.path.expanduser("~"), target_dir)
+    if not os.path.exists(path_to_data):
+        os.makedirs(path_to_data)
+    target_unpacked_dir = os.path.join(path_to_data, "an4_unpacked")
+    path_utils.try_create_directory(target_unpacked_dir)
+
+    extracted_dir = os.path.join(path_to_data, "An4")
+    if os.path.exists(extracted_dir):
+        shutil.rmtree(extracted_dir)
+    logger.info("Start downloading...")
+    file_name = AN4_URL.split("/")[-1]
+
+    target_filename = os.path.join(target_unpacked_dir, file_name)
+    path_utils.try_download(target_filename, AN4_URL)
+    logger.info("Download complete")
+    logger.info("Unpacking...")
+    tar = tarfile.open(target_filename)
+    tar.extractall(extracted_dir)
+    tar.close()
+    assert os.path.exists(extracted_dir), f"Archive {file_name} was not properly uncompressed"
+    logger.info("Converting files to wav and extracting transcripts...")
+
+    create_wav_and_transcripts(path_to_data, 'train', sample_rate, extracted_dir, 'an4_clstk')
+    create_wav_and_transcripts(path_to_data, 'test', sample_rate, extracted_dir, 'an4test_clstk')
+
+    create_manifest(os.path.join(path_to_data, 'train'), os.path.join(path_to_data, 'an4_train_manifest.csv'), min_duration, max_duration)
+    create_manifest(os.path.join(path_to_data, 'test'), os.path.join(path_to_data, 'an4_val_manifest.csv'), min_duration, max_duration)
+
+def create_wav_and_transcripts(path, data_tag, sample_rate, extracted_dir, wav_subfolder_name):
+    tag_path = os.path.join(path, data_tag)
+    transcript_path_new = os.path.join(tag_path, 'txt')
+    wav_path_new = os.path.join(tag_path, 'wav')
+
+    path_utils.try_create_directory(transcript_path_new)
+    path_utils.try_create_directory(wav_path_new)
+
+    wav_path_ext = os.path.join(extracted_dir, 'an4/wav')
+    file_ids = os.path.join(extracted_dir, f'an4/etc/an4_{data_tag}.fileids')
+    transcripts_ext = os.path.join(extracted_dir, f'an4/etc/an4_{data_tag}.transcription')
+    path = os.path.join(wav_path_ext, wav_subfolder_name)
+    convert_audio_to_wav(path, sample_rate)
+    format_files(file_ids, transcript_path_new, wav_path_new, transcripts_ext, wav_path_ext)

+def convert_audio_to_wav(train_path, sample_rate):
     with os.popen('find %s -type f -name "*.raw"' % train_path) as pipe:
         for line in pipe:
             raw_path = line.strip()
             new_path = line.replace('.raw', '.wav').strip()
-            cmd = 'sox -t raw -r %d -b 16 -e signed-integer -B -c 1 \"%s\" \"%s\"' % (
-                16000, raw_path, new_path)
-            os.system(cmd)
+            audio_tools.transcode_recordings_an4(raw_path=raw_path, wav_path=new_path, sample_rate=sample_rate)


-def _format_files(file_ids, new_transcript_path, new_wav_path, transcripts, wav_path):
+def format_files(file_ids, new_transcript_path, new_wav_path, transcripts, wav_path):
     with open(file_ids, 'r') as f:
         with open(transcripts, 'r') as t:
             paths = f.readlines()
             transcripts = t.readlines()
             for x in range(len(paths)):
-                path = wav_path + paths[x].strip() + '.wav'
+                path = os.path.join(wav_path, paths[x].strip()) + '.wav'
                 filename = path.split('/')[-1]
                 extracted_transcript = _process_transcript(transcripts, x)
                 current_path = os.path.abspath(path)
-                new_path = new_wav_path + filename
-                text_path = new_transcript_path + filename.replace('.wav', '.txt')
+                new_path = os.path.join(new_wav_path, filename)
+                text_path = os.path.join(new_transcript_path, filename.replace('.wav', '.txt'))
                 with io.FileIO(text_path, "w") as file:
                     file.write(extracted_transcript.encode('utf-8'))
                 os.rename(current_path, new_path)
@@ -64,23 +88,20 @@ def _process_transcript(transcripts, x):
     extracted_transcript = transcripts[x].split('(')[0].strip("").split('<')[0].strip().upper()
     return extracted_transcript

-
-def main():
-    root_path = 'an4/'
-    name = 'an4'
-    wget.download('http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz')
-    tar = tarfile.open('an4_raw.bigendian.tar.gz')
-    tar.extractall()
-    os.makedirs(args.target_dir)
-    _format_data(root_path, 'train', name, 'an4_clstk')
-    _format_data(root_path, 'test', name, 'an4test_clstk')
-    shutil.rmtree(root_path)
-    os.remove('an4_raw.bigendian.tar.gz')
-    train_path = args.target_dir + '/train/'
-    test_path = args.target_dir + '/test/'
-    print ('\n', 'Creating manifests...')
-    create_manifest(train_path, 'an4_train_manifest.csv', args.min_duration, args.max_duration)
-    create_manifest(test_path, 'an4_val_manifest.csv')
+@click.command()
+@click.option("--target-dir", default="temp/data/an4", type=str, help="Directory to store the dataset.")
+@click.option("--sample-rate", default=16000, type=int, help="Sample rate.")
+@click.option("--min-duration", default=1, type=int,
+              help="Prunes training samples shorter than the min duration (given in seconds).")
+@click.option("--max-duration", default=15, type=int,
+              help="Prunes training samples longer than the max duration (given in seconds).")
+def main(**kwargs):
+    """Processes and downloads an4 dataset."""
+    global logger
+    logger = logging.getLogger(SONOSCO)
+    setup_logging(logger)
+    try_download_an4(**kwargs)

 if __name__ == '__main__':
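transcode_recordings_an4 needs the sox flags -t raw, -e signed-integer and -B because the AN4 corpus ships headerless big-endian 16-bit PCM. For reference, the same conversion can be done with the standard library alone; this sketch assumes mono 16-bit big-endian input at the stated rate and performs no resampling.

import wave


def raw_to_wav(raw_path, wav_path, sample_rate):
    with open(raw_path, "rb") as f:
        pcm = bytearray(f.read())
    # WAV stores little-endian samples; AN4 raw files are big-endian,
    # so swap the two bytes of every 16-bit sample.
    pcm[0::2], pcm[1::2] = pcm[1::2], pcm[0::2]
    with wave.open(wav_path, "wb") as w:
        w.setnchannels(1)
        w.setsampwidth(2)
        w.setframerate(sample_rate)
        w.writeframes(bytes(pcm))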
From b93f7ee154221e23ff9201f955e8aed3b33bc5ce Mon Sep 17 00:00:00 2001
From: ga38nif
Date: Wed, 12 Jun 2019 11:14:21 +0200
Subject: [PATCH 30/58] add global logger

---
 sonosco/datasets/download_datasets/common_voice.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sonosco/datasets/download_datasets/common_voice.py b/sonosco/datasets/download_datasets/common_voice.py
index 7ea1eff..1c66e4f 100644
--- a/sonosco/datasets/download_datasets/common_voice.py
+++ b/sonosco/datasets/download_datasets/common_voice.py
@@ -94,6 +94,7 @@ def process(x):
             pool.map(process, data)

 def main(**kwargs):
+    global logger
     logger = logging.getLogger(SONOSCO)
     setup_logging(logger)
     try_download_common_voice(**kwargs)

From 31ae0266730d9d99f4ebb323fe24e4404e968513 Mon Sep 17 00:00:00 2001
From: ga38nif
Date: Wed, 12 Jun 2019 11:14:44 +0200
Subject: [PATCH 31/58] add transcription function for ted3

---
 sonosco/common/audio_tools.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sonosco/common/audio_tools.py b/sonosco/common/audio_tools.py
index 77befb6..cd7a2bc 100644
--- a/sonosco/common/audio_tools.py
+++ b/sonosco/common/audio_tools.py
@@ -9,4 +9,7 @@ def transcode_recording(source, destination, sample_rate):
     subprocess.call([f"sox {source} -r {sample_rate} -b 16 -c 1 {destination}"], shell=True)

 def transcode_recordings_an4(raw_path, wav_path, sample_rate):
-    subprocess.call([f'sox -t raw -r {sample_rate} -b 16 -e signed-integer -B -c 1 \"{raw_path}\" \"{wav_path}\"'], shell=True)
\ No newline at end of file
+    subprocess.call([f'sox -t raw -r {sample_rate} -b 16 -e signed-integer -B -c 1 \"{raw_path}\" \"{wav_path}\"'], shell=True)
+
+def transcode_recordings_ted3(source, destination, start_time, end_time, sample_rate):
+    subprocess.call([f"sox {source} -r {sample_rate} -b 16 -c 1 {destination} trim {start_time} ={end_time}"], shell=True)
\ No newline at end of file
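transcode_recordings_ted3 relies on sox's trim effect, where a position written as =<end> is an absolute offset from the start of the file rather than a duration, which is exactly what the TED-LIUM STM segment boundaries provide. A quoting-safe sketch of the same cut follows; the name cut_segment is illustrative.

import subprocess


def cut_segment(source, destination, start_time, end_time, sample_rate=16000):
    # "trim <start> =<end>" keeps audio from <start> up to the absolute
    # position <end>; without the "=", the second value would be a duration.
    subprocess.run(
        ["sox", str(source), "-r", str(sample_rate), "-b", "16", "-c", "1",
         str(destination), "trim", str(start_time), f"={end_time}"],
        check=True,
    )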
From 459eef404b716b759ab142a23b8547eaa0117efc Mon Sep 17 00:00:00 2001
From: ga38nif
Date: Wed, 12 Jun 2019 11:15:09 +0200
Subject: [PATCH 32/58] adapt download script for ted3 to click and new scheme

---
 sonosco/datasets/download_datasets/ted3.py | 109 +++++++++++----------
 1 file changed, 55 insertions(+), 54 deletions(-)

diff --git a/sonosco/datasets/download_datasets/ted3.py b/sonosco/datasets/download_datasets/ted3.py
index c7d4b3e..b50cdd6 100644
--- a/sonosco/datasets/download_datasets/ted3.py
+++ b/sonosco/datasets/download_datasets/ted3.py
@@ -1,25 +1,49 @@
 import os
-import wget
-import tarfile
+import click
+import logging
-import argparse
 import subprocess
 import unicodedata
+import tarfile
 import io
-from utils import create_manifest
+import shutil
+import sonosco.common.audio_tools as audio_tools
+import sonosco.common.path_utils as path_utils
+from sonosco.datasets.download_datasets.data_utils import create_manifest
+from sonosco.common.utils import setup_logging
+from sonosco.common.constants import *

 from tqdm import tqdm

-parser = argparse.ArgumentParser(description='Processes and downloads TED-LIUMv3 dataset.')
-parser.add_argument("--target-dir", default='TEDLIUM3_dataset/', type=str, help="Directory to store the dataset.")
-parser.add_argument("--tar-path", type=str, help="Path to the TEDLIUM_release tar if downloaded (Optional).")
-parser.add_argument('--sample-rate', default=16000, type=int, help='Sample rate')
-parser.add_argument('--min-duration', default=1, type=int,
-                    help='Prunes training samples shorter than the min duration (given in seconds, default 1)')
-parser.add_argument('--max-duration', default=15, type=int,
-                    help='Prunes training samples longer than the max duration (given in seconds, default 15)')
-args = parser.parse_args()
+logger = logging.getLogger(__name__)

 TED_LIUM_V2_DL_URL = "http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz"

+def try_download_ted3(target_dir, sample_rate, min_duration, max_duration):
+    path_to_data = os.path.join(os.path.expanduser("~"), target_dir)
+    path_utils.try_create_directory(path_to_data)
+
+    target_unpacked_dir = os.path.join(path_to_data, "ted3_unpacked")
+    path_utils.try_create_directory(target_unpacked_dir)
+
+    extracted_dir = os.path.join(path_to_data, "Ted3")
+    if os.path.exists(extracted_dir):
+        shutil.rmtree(extracted_dir)
+    logger.info("Start downloading...")
+    file_name = TED_LIUM_V2_DL_URL.split("/")[-1]
+    target_filename = os.path.join(target_unpacked_dir, file_name)
+    path_utils.try_download(target_filename, TED_LIUM_V2_DL_URL)
+
+    logger.info("Download complete")
+    logger.info("Unpacking...")
+    tar = tarfile.open(target_filename)
+    tar.extractall(extracted_dir)
+    tar.close()
+    shutil.rmtree(target_unpacked_dir)
+    assert os.path.exists(extracted_dir), f"Archive {file_name} was not properly uncompressed"
+    logger.info("Converting files to wav and extracting transcripts...")
+    prepare_dir(path_to_data, sample_rate)
+    create_manifest(path_to_data, os.path.join(path_to_data, 'ted3_train_manifest.csv'), min_duration, max_duration)

 def get_utterances_from_stm(stm_file):
     """
@@ -45,12 +69,6 @@ def get_utterances_from_stm(stm_file):
     return res

-def cut_utterance(src_sph_file, target_wav_file, start_time, end_time, sample_rate=16000):
-    subprocess.call(["sox {} -r {} -b 16 -c 1 {} trim {} ={}".format(src_sph_file, str(sample_rate),
-                                                                     target_wav_file, start_time, end_time)],
-                    shell=True)
-
-
 def _preprocess_transcript(phrase):
     return phrase.strip().upper()

@@ -59,15 +77,12 @@ def filter_short_utterances(utterance_info, min_len_sec=1.0):
     return utterance_info["end_time"] - utterance_info["start_time"] > min_len_sec

-def prepare_dir(ted_dir):
-    converted_dir = os.path.join(ted_dir, "converted")
+def prepare_dir(ted_dir, sample_rate):
     # directories to store converted wav files and their transcriptions
-    wav_dir = os.path.join(converted_dir, "wav")
-    if not os.path.exists(wav_dir):
-        os.makedirs(wav_dir)
-    txt_dir = os.path.join(converted_dir, "txt")
-    if not os.path.exists(txt_dir):
-        os.makedirs(txt_dir)
+    wav_dir = os.path.join(ted_dir, "wav")
+    path_utils.try_create_directory(wav_dir)
+    txt_dir = os.path.join(ted_dir, "txt")
+    path_utils.try_create_directory(txt_dir)
     counter = 0
     entries = os.listdir(os.path.join(ted_dir, "sph"))
     for sph_file in tqdm(entries, total=len(entries)):
@@ -83,41 +98,27 @@ def prepare_dir(ted_dir):
         for utterance_id, utterance in enumerate(all_utterances):
             target_wav_file = os.path.join(wav_dir, "{}_{}.wav".format(utterance["filename"], str(utterance_id)))
             target_txt_file = os.path.join(txt_dir, "{}_{}.txt".format(utterance["filename"], str(utterance_id)))
-            cut_utterance(sph_file_full, target_wav_file, utterance["start_time"], utterance["end_time"],
-                          sample_rate=args.sample_rate)
+
audio_tools.transcode_recordings_ted3(sph_file_full, target_wav_file, utterance["start_time"], utterance["end_time"], + sample_rate=sample_rate) with io.FileIO(target_txt_file, "w") as f: f.write(_preprocess_transcript(utterance["transcript"]).encode('utf-8')) counter += 1 +@click.command() +@click.option("--target-dir", default="temp/data/ted3", type=str, help="Directory to store the dataset.") +@click.option("--sample-rate", default=16000, type=int, help="Sample rate.") -def main(): - target_dl_dir = args.target_dir - #if not os.path.exists(target_dl_dir): - # os.makedirs(target_dl_dir) - - target_unpacked_dir = os.path.join(target_dl_dir, "TEDLIUM_release-3") - #if args.tar_path and os.path.exists(args.tar_path): - # target_file = args.tar_path - #else: - # print("Could not find downloaded TEDLIUM archive, Downloading corpus...") - # wget.download(TED_LIUM_V2_DL_URL, target_dl_dir) - # target_file = os.path.join(target_dl_dir, "TEDLIUM_release-3.tgz") - #if not os.path.exists(target_unpacked_dir): - # print("Unpacking corpus...") - # tar = tarfile.open(target_file) - # tar.extractall(target_dl_dir) - # tar.close() - #else: - # print("Found TEDLIUM directory, skipping unpacking of tar files") - - train_ted_dir = os.path.join(target_unpacked_dir, "data") - train_ted_dir = os.path.join(train_ted_dir, "converted") - - #prepare_dir(train_ted_dir) - print('Creating manifests...') +@click.option("--min-duration", default=1, type=int, + help="Prunes training samples shorter than the min duration (given in seconds).") +@click.option("--max-duration", default=15, type=int, + help="Prunes training samples longer than the max duration (given in seconds).") - create_manifest(train_ted_dir, 'ted3_train_manifest.csv', args.min_duration, args.max_duration) +def main(**kwargs): + global logger + logger = logging.getLogger(SONOSCO) + setup_logging(logger) + try_download_ted3(**kwargs) if __name__ == "__main__": main() From 4ac59e99b396e6e2730d4568d45547d81ee75880 Mon Sep 17 00:00:00 2001 From: ga38nif Date: Wed, 12 Jun 2019 11:32:45 +0200 Subject: [PATCH 33/58] adapt voxforge to click and adapt to datapaths --- .../datasets/download_datasets/voxforge.py | 71 +++++++++++-------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/sonosco/datasets/download_datasets/voxforge.py b/sonosco/datasets/download_datasets/voxforge.py index a31febf..6317354 100644 --- a/sonosco/datasets/download_datasets/voxforge.py +++ b/sonosco/datasets/download_datasets/voxforge.py @@ -1,4 +1,6 @@ import os +import click +import logging from six.moves import urllib import argparse import re @@ -7,22 +9,29 @@ import subprocess import tarfile import io +import sonosco.common.audio_tools as audio_tools +import sonosco.common.path_utils as path_utils +from sonosco.datasets.download_datasets.data_utils import create_manifest +from sonosco.common.utils import setup_logging +from sonosco.common.constants import * from tqdm import tqdm -from utils import create_manifest +logger = logging.getLogger(__name__) VOXFORGE_URL_16kHz = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit/' -parser = argparse.ArgumentParser(description='Processes and downloads VoxForge dataset.') -parser.add_argument("--target-dir", default='voxforge_dataset/', type=str, help="Directory to store the dataset.") -parser.add_argument('--sample-rate', default=16000, - type=int, help='Sample rate') -parser.add_argument('--min-duration', default=1, type=int, - help='Prunes training samples shorter than the min duration 
(given in seconds, default 1)') -parser.add_argument('--max-duration', default=15, type=int, - help='Prunes training samples longer than the max duration (given in seconds, default 15)') -args = parser.parse_args() +def try_download_voxforge(target_dir, sample_rate, min_duration, max_duration): + path_to_data = os.path.join(os.path.expanduser("~"), target_dir) + path_utils.try_create_directory(path_to_data) + logger.info("Start downloading...") + request = urllib.request.Request(VOXFORGE_URL_16kHz) + response = urllib.request.urlopen(request) + content = response.read() + all_files = re.findall("href\=\"(.*\.tgz)\"", content.decode("utf-8")) + for f in tqdm(all_files, total=len(all_files)): + prepare_sample(f.replace(".tgz", ""), VOXFORGE_URL_16kHz + f, path_to_data, sample_rate) + create_manifest(path_to_data, os.path.join(path_to_data,'voxforge_train_manifest.csv'), min_duration, max_duration) def _get_recordings_dir(sample_dir, recording_name): wav_dir = os.path.join(sample_dir, recording_name, "wav") @@ -34,16 +43,14 @@ def _get_recordings_dir(sample_dir, recording_name): raise Exception("wav or flac directory was not found for recording name: {}".format(recording_name)) -def prepare_sample(recording_name, url, target_folder): +def prepare_sample(recording_name, url, target_folder, sample_rate): """ Downloads and extracts a sample from VoxForge and puts the wav and txt files into :target_folder. """ wav_dir = os.path.join(target_folder, "wav") - if not os.path.exists(wav_dir): - os.makedirs(wav_dir) + path_utils.try_create_directory(wav_dir) txt_dir = os.path.join(target_folder, "txt") - if not os.path.exists(txt_dir): - os.makedirs(txt_dir) + path_utils.try_create_directory(txt_dir) # check if sample is processed filename_set = set(['_'.join(wav_file.split('_')[:-1]) for wav_file in os.listdir(wav_dir)]) if recording_name in filename_set: @@ -80,23 +87,27 @@ def prepare_sample(recording_name, url, target_folder): with io.FileIO(target_txt_file, "w") as file: file.write(utterance.encode('utf-8')) original_wav_file = os.path.join(recordings_dir, wav_file) - subprocess.call(["sox {} -r {} -b 16 -c 1 {}".format(original_wav_file, str(args.sample_rate), - target_wav_file)], shell=True) + audio_tools.transcode_recording(original_wav_file, target_wav_file, sample_rate) shutil.rmtree(dirpath) +@click.command() +@click.option("--target-dir", default="temp/data/voxforge", type=str, help="Directory to store the dataset.") +@click.option("--sample-rate", default=16000, type=int, help="Sample rate.") -if __name__ == '__main__': - target_dir = args.target_dir - sample_rate = args.sample_rate +@click.option("--min-duration", default=1, type=int, + help="Prunes training samples shorter than the min duration (given in seconds).") +@click.option("--max-duration", default=15, type=int, + help="Prunes training samples longer than the max duration (given in seconds).") - if not os.path.isdir(target_dir): - os.makedirs(target_dir) - request = urllib.request.Request(VOXFORGE_URL_16kHz) - response = urllib.request.urlopen(request) - content = response.read() - all_files = re.findall("href\=\"(.*\.tgz)\"", content.decode("utf-8")) - for f in tqdm(all_files, total=len(all_files)): - prepare_sample(f.replace(".tgz", ""), VOXFORGE_URL_16kHz + f, target_dir) - print('Creating manifests...') - create_manifest(target_dir, 'voxforge_train_manifest.csv', args.min_duration, args.max_duration) \ No newline at end of file + + +def main(**kwargs): + global logger + logger = logging.getLogger(SONOSCO) + 
setup_logging(logger) + try_download_voxforge(**kwargs) + + +if __name__ == '__main__': + main() \ No newline at end of file From 4cefe935ae67e8c492569208a2230a09601f00a7 Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Wed, 12 Jun 2019 18:22:33 +0200 Subject: [PATCH 34/58] add tests --- requirements.txt | 1 + sonosco/common/click_extensions.py | 4 +- ...udioDataLoader.py => audio_data_loader.py} | 0 ...ioDataSampler.py => audio_data_sampler.py} | 0 .../{AudioDataset.py => audio_dataset.py} | 37 ++----------- .../download_datasets/common_voice.py | 8 +-- .../datasets/download_datasets/data_utils.py | 10 ++-- .../datasets/download_datasets/librispeech.py | 19 +++---- tests/test_dataset.py | 53 +++++++++++++++++++ 9 files changed, 79 insertions(+), 53 deletions(-) rename sonosco/datasets/{AudioDataLoader.py => audio_data_loader.py} (100%) rename sonosco/datasets/{AudioDataSampler.py => audio_data_sampler.py} (100%) rename sonosco/datasets/{AudioDataset.py => audio_dataset.py} (78%) create mode 100644 tests/test_dataset.py diff --git a/requirements.txt b/requirements.txt index 46e6bd6..3547e7e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,3 +20,4 @@ torchvision==0.3.0 tqdm==4.32.1 pyyaml==5.1 wget==3.2 +pytest \ No newline at end of file diff --git a/sonosco/common/click_extensions.py b/sonosco/common/click_extensions.py index 18ff96f..3554572 100644 --- a/sonosco/common/click_extensions.py +++ b/sonosco/common/click_extensions.py @@ -3,7 +3,7 @@ import logging -logger = logging.getLogger(__name__) +LOGGER = logging.getLogger(__name__) class PythonLiteralOption(click.Option): @@ -12,5 +12,5 @@ def type_cast_value(self, ctx, value): try: return ast.literal_eval(value) except Exception as e: - logger.error(f"Malformed click input for PythonLiteralOption {e}", exc_info=True) + LOGGER.error(f"Malformed click input for PythonLiteralOption {e}", exc_info=True) raise click.BadParameter(value) diff --git a/sonosco/datasets/AudioDataLoader.py b/sonosco/datasets/audio_data_loader.py similarity index 100% rename from sonosco/datasets/AudioDataLoader.py rename to sonosco/datasets/audio_data_loader.py diff --git a/sonosco/datasets/AudioDataSampler.py b/sonosco/datasets/audio_data_sampler.py similarity index 100% rename from sonosco/datasets/AudioDataSampler.py rename to sonosco/datasets/audio_data_sampler.py diff --git a/sonosco/datasets/AudioDataset.py b/sonosco/datasets/audio_dataset.py similarity index 78% rename from sonosco/datasets/AudioDataset.py rename to sonosco/datasets/audio_dataset.py index 098d195..8677396 100644 --- a/sonosco/datasets/AudioDataset.py +++ b/sonosco/datasets/audio_dataset.py @@ -16,7 +16,7 @@ from sonosco.common.constants import * -logger = logging.getLogger(__name__) +LOGGER = logging.getLogger(__name__) class DataProcessor: @@ -85,10 +85,10 @@ def parse_audio(self, audio_path): def parse_transcript(self, transcript_path): with open(transcript_path, 'r', encoding='utf8') as transcript_file: transcript = transcript_file.read().replace('\n', '') - logger.info(f"1: {transcript}") + LOGGER.info(f"1: {transcript}") # TODO: Is it fast enough? 
transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)])) - logger.info(f"transcript_path: {transcript_path} transcript: {transcript}") + LOGGER.info(f"transcript_path: {transcript_path} transcript: {transcript}") return transcript @@ -103,7 +103,7 @@ def __init__(self, processor: DataProcessor, manifest_filepath): :param processor: Data processor object :param manifest_filepath: Path to manifest csv as describe above """ - super(AudioDataset, self).__init__() + super().__init__() with open(manifest_filepath) as f: ids = f.readlines() ids = [x.strip().split(',') for x in ids] @@ -122,32 +122,3 @@ def __getitem__(self, index): def __len__(self): return self.size - - -def main(): - global logger - logger = logging.getLogger(SONOSCO) - setup_logging(logger) - - # create data processor - audio_conf = dict(sample_rate=16000, window_size=.02, window_stride=.01, - labels='ABCDEFGHIJKLMNOPQRSTUVWXYZ', normalize=True, augment=False) - processor = DataProcessor(**audio_conf) - - # get manifest file - manifest_directory = os.path.join(os.path.expanduser("~"), "temp/data/libri_speech") - test_manifest = os.path.join(manifest_directory, "libri_test_clean_manifest.csv") - - # create audio dataset - test_dataset = AudioDataset(processor, manifest_filepath=test_manifest) - logger.info("Dataset is created") - test = test_dataset[0] - batch_size = 16 - sampler = BucketingSampler(test_dataset, batch_size=batch_size) - dataloader = DataLoader(dataset=test_dataset, num_workers=4, collate_fn=_collate_fn, batch_sampler=sampler) - test_dataset[0] - #inputs, targets, input_percentages, target_sizes = next(iter(dataloader)) - - -if __name__ == "__main__": - main() diff --git a/sonosco/datasets/download_datasets/common_voice.py b/sonosco/datasets/download_datasets/common_voice.py index 70451ab..3b75e52 100644 --- a/sonosco/datasets/download_datasets/common_voice.py +++ b/sonosco/datasets/download_datasets/common_voice.py @@ -12,7 +12,7 @@ from sonosco.datasets.download_datasets.data_utils import create_manifest from sonosco.common.utils import setup_logging -logger = logging.getLogger("sonosco") +LOGGER = logging.getLogger("sonosco") COMMON_VOICE_URL = "https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz" @@ -60,7 +60,7 @@ def process(x): def main(target_dir, sample_rate, files_to_use, min_duration, max_duration): - setup_logging(logger) + setup_logging(LOGGER) path_to_data = os.path.join(os.path.expanduser("~"), target_dir) path_utils.try_create_directory(path_to_data) @@ -74,8 +74,8 @@ def main(target_dir, sample_rate, files_to_use, min_duration, max_duration): path_utils.try_download(target_unpacked_dir, COMMON_VOICE_URL) - logger.info("Download complete") - logger.info("Unpacking...") + LOGGER.info("Download complete") + LOGGER.info("Unpacking...") print("Unpacking corpus to {} ...".format(target_unpacked_dir)) tar = tarfile.open(target_unpacked_dir) diff --git a/sonosco/datasets/download_datasets/data_utils.py b/sonosco/datasets/download_datasets/data_utils.py index e7e15d5..167eb92 100644 --- a/sonosco/datasets/download_datasets/data_utils.py +++ b/sonosco/datasets/download_datasets/data_utils.py @@ -7,15 +7,15 @@ from tqdm import tqdm -logger = logging.getLogger(__name__) +LOGGER = logging.getLogger(__name__) def create_manifest(data_path, output_path, min_duration=None, max_duration=None): - logger.info(f"Creating a manifest for path: {data_path}") + LOGGER.info(f"Creating a manifest for path: {data_path}") file_paths = [os.path.join(dirpath, f) for dirpath, 
dirnames, files in os.walk(data_path) for f in fnmatch.filter(files, '*.wav')] - logger.info(f"Found {len(file_paths)} .wav files") + LOGGER.info(f"Found {len(file_paths)} .wav files") file_paths = order_and_prune_files(file_paths, min_duration, max_duration) with io.FileIO(output_path, "w") as file: for wav_path in tqdm(file_paths, total=len(file_paths)): @@ -25,11 +25,11 @@ def create_manifest(data_path, output_path, min_duration=None, max_duration=None def order_and_prune_files(file_paths, min_duration, max_duration): - logger.info("Sorting manifests...") + LOGGER.info("Sorting manifests...") path_and_duration = [(path, audio_tools.get_duration(path)) for path in file_paths] if min_duration and max_duration: - logger.info(f"Pruning manifests between {min_duration} and {max_duration} seconds") + LOGGER.info(f"Pruning manifests between {min_duration} and {max_duration} seconds") path_and_duration = [(path, duration) for path, duration in path_and_duration if min_duration <= duration <= max_duration] diff --git a/sonosco/datasets/download_datasets/librispeech.py b/sonosco/datasets/download_datasets/librispeech.py index 59df937..d4fb0e7 100644 --- a/sonosco/datasets/download_datasets/librispeech.py +++ b/sonosco/datasets/download_datasets/librispeech.py @@ -12,7 +12,7 @@ from tqdm import tqdm -logger = logging.getLogger(__name__) +LOGGER = logging.getLogger(__name__) LIBRI_SPEECH_URLS = { @@ -52,28 +52,29 @@ def try_download_librispeech(target_dir, sample_rate, files_to_use, min_duration if url.find(f) != -1: dl_flag = True if not dl_flag: - logger.info(f"Skipping url: {url}") + LOGGER.info(f"Skipping url: {url}") continue filename = url.split("/")[-1] target_filename = os.path.join(split_dir, filename) + LOGGER.info(f"Downloading from {url}") path_utils.try_download(target_filename, url) - logger.info("Download complete") - logger.info(f"Unpacking {filename}...") + LOGGER.info("Download complete") + LOGGER.info(f"Unpacking {filename}...") tar = tarfile.open(target_filename) tar.extractall(split_dir) tar.close() os.remove(target_filename) assert os.path.exists(extracted_dir), f"Archive {filename} was not properly uncompressed" - logger.info("Converting flac files to wav and extracting transcripts...") + LOGGER.info("Converting flac files to wav and extracting transcripts...") for root, subdirs, files in tqdm(os.walk(extracted_dir)): for f in files: if f.find(".flac") != -1: _process_file(wav_dir=split_wav_dir, txt_dir=split_txt_dir, base_filename=f, root_dir=root, sample_rate=sample_rate) - logger.info(f"Finished {url}") + LOGGER.info(f"Finished {url}") shutil.rmtree(extracted_dir) manifest_path = os.path.join(path_to_data, f"libri_{split_type}_manifest.csv") @@ -121,9 +122,9 @@ def _process_file(wav_dir, txt_dir, base_filename, root_dir, sample_rate): help="Prunes training samples longer than the max duration (given in seconds).") def main(**kwargs): """Processes and downloads LibriSpeech dataset.""" - global logger - logger = logging.getLogger(SONOSCO) - setup_logging(logger) + global LOGGER + LOGGER = logging.getLogger(SONOSCO) + setup_logging(LOGGER) try_download_librispeech(**kwargs) diff --git a/tests/test_dataset.py b/tests/test_dataset.py new file mode 100644 index 0000000..977b549 --- /dev/null +++ b/tests/test_dataset.py @@ -0,0 +1,53 @@ +import logging +import os + +from sonosco.common.constants import SONOSCO +from sonosco.datasets.audio_dataset import AudioDataset, DataProcessor +from sonosco.datasets.audio_data_sampler import BucketingSampler +from 
sonosco.datasets.audio_data_loader import DataLoader +from sonosco.datasets.download_datasets.librispeech import try_download_librispeech + + +LOGGER = logging.getLogger(SONOSCO) +LIBRI_SPEECH_DIR = "temp/test_data/libri_speech" + + +def test_librispeech_download(): + # prepare + if os.path.exists(LIBRI_SPEECH_DIR): + os.removedirs(LIBRI_SPEECH_DIR) + + # get manifest file + manifest_directory = os.path.join(os.path.expanduser("~"), LIBRI_SPEECH_DIR) + test_manifest = os.path.join(manifest_directory, "libri_test_clean_manifest.csv") + + if not os.path.exists(test_manifest): + try_download_librispeech(LIBRI_SPEECH_DIR, 16000, ["test-clean.tar.gz", "test-other.tar.gz"], 1, 15) + + assert os.path.exists(test_manifest) + + +def test_librispeech_clean(): + # create data processor + audio_conf = dict(sample_rate=16000, window_size=.02, window_stride=.01, + labels='ABCDEFGHIJKLMNOPQRSTUVWXYZ', normalize=True, augment=False) + processor = DataProcessor(**audio_conf) + + # get manifest file + manifest_directory = os.path.join(os.path.expanduser("~"), LIBRI_SPEECH_DIR) + test_manifest = os.path.join(manifest_directory, "libri_test_clean_manifest.csv") + + if not os.path.exists(test_manifest): + try_download_librispeech(LIBRI_SPEECH_DIR, 16000, ["test-clean.tar.gz", "test-other.tar.gz"], 1, 15) + + assert os.path.exists(test_manifest) + + # create audio dataset + test_dataset = AudioDataset(processor, manifest_filepath=test_manifest) + LOGGER.info("Dataset is created") + test = test_dataset[0] + batch_size = 16 + sampler = BucketingSampler(test_dataset, batch_size=batch_size) + dataloader = DataLoader(dataset=test_dataset, num_workers=4, batch_sampler=sampler) + test_dataset[0] + From 2e8de3abab3fc1e5195ab8a488e0d11a8eaa78dd Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Sun, 16 Jun 2019 12:39:08 +0200 Subject: [PATCH 35/58] add augmentation utils --- sonosco/common/audio_tools.py | 21 ++++++++++++++++++++- sonosco/datasets/audio_dataset.py | 22 ++++++++++++---------- tests/test_dataset.py | 2 +- 3 files changed, 33 insertions(+), 12 deletions(-) diff --git a/sonosco/common/audio_tools.py b/sonosco/common/audio_tools.py index cd7a2bc..4cd1ac8 100644 --- a/sonosco/common/audio_tools.py +++ b/sonosco/common/audio_tools.py @@ -1,4 +1,6 @@ import subprocess +import numpy as np +import librosa def get_duration(file_path): @@ -8,8 +10,25 @@ def get_duration(file_path): def transcode_recording(source, destination, sample_rate): subprocess.call([f"sox {source} -r {sample_rate} -b 16 -c 1 {destination}"], shell=True) + def transcode_recordings_an4(raw_path, wav_path, sample_rate): subprocess.call([f'sox -t raw -r {sample_rate} -b 16 -e signed-integer -B -c 1 \"{raw_path}\" \"{wav_path}\"'], shell=True) + def transcode_recordings_ted3(source, destination, start_time, end_time, sample_rate): - subprocess.call([f"sox {source} -r {sample_rate} -b 16 -c 1 {destination} trim {start_time} ={end_time}"],shell=True) \ No newline at end of file + subprocess.call([f"sox {source} -r {sample_rate} -b 16 -c 1 {destination} trim {start_time} ={end_time}"],shell=True) + + +def add_noise(audio, std=0.005): + noise = np.random.randn(len(audio)) + data_noise = audio + std * noise + return data_noise + + +def shift(audio, n_samples=1600): + return np.roll(audio, n_samples) + + +def stretch(audio, rate=1): + stretched_audio = librosa.effects.time_stretch(audio, rate) + return stretched_audio diff --git a/sonosco/datasets/audio_dataset.py b/sonosco/datasets/audio_dataset.py index 8677396..3d7538b 100644 --- 
a/sonosco/datasets/audio_dataset.py +++ b/sonosco/datasets/audio_dataset.py @@ -3,17 +3,14 @@ # https://github.com/SeanNaren/deepspeech.pytorch # ---------------------------------------------------------------------------- -import warnings -import os import logging import torch import torchaudio +import librosa import sonosco.config.global_settings as global_settings +import sonosco.common.audio_tools as audio_tools -from typing import Tuple from torch.utils.data import Dataset -from sonosco.common.utils import setup_logging -from sonosco.common.constants import * LOGGER = logging.getLogger(__name__) @@ -55,10 +52,12 @@ def retrieve_file(audio_path): return sound, sample_rate @staticmethod - def augment_audio(sound, tempo_range: Tuple = (0.85, 1.15), gain_range: Tuple = (-6, 8)): - """Changes tempo and gain of the wave.""" - warnings.warn("Augmentation is not implemented") # TODO: Implement - return sound + def augment_audio(sound): + sound_array = sound.numpy().squeeze() + stretched_sound = audio_tools.stretch(sound_array, 0.5) + import pdb; pdb.set_trace() + stretched_sound = audio_tools.shift(stretched_sound, 4000) + return torch.from_numpy(stretched_sound) def parse_audio(self, audio_path): sound, sample_rate = self.retrieve_file(audio_path) @@ -66,8 +65,11 @@ def parse_audio(self, audio_path): if sample_rate != self.sample_rate: raise ValueError(f"The stated sample rate {self.sample_rate} and the factual rate {sample_rate} differ!") + librosa.output.write_wav("/Users/yuriy/Desktop/original.wav", sound.numpy().transpose(), sample_rate) + if self.augment: - sound = self.augment_audio(sound) + stretched_sound = self.augment_audio(sound) + librosa.output.write_wav("/Users/yuriy/Desktop/stretched.wav", stretched_sound.numpy().transpose(), sample_rate) if global_settings.CUDA_ENABLED: sound = sound.cuda() diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 977b549..359844f 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -30,7 +30,7 @@ def test_librispeech_download(): def test_librispeech_clean(): # create data processor audio_conf = dict(sample_rate=16000, window_size=.02, window_stride=.01, - labels='ABCDEFGHIJKLMNOPQRSTUVWXYZ', normalize=True, augment=False) + labels='ABCDEFGHIJKLMNOPQRSTUVWXYZ', normalize=True, augment=True) processor = DataProcessor(**audio_conf) # get manifest file From 420608068f7622d378a7bd9dd84460c04156e281 Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Sun, 16 Jun 2019 13:52:23 +0200 Subject: [PATCH 36/58] change to librosa --- sonosco/datasets/audio_dataset.py | 32 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/sonosco/datasets/audio_dataset.py b/sonosco/datasets/audio_dataset.py index 3d7538b..a38d0fd 100644 --- a/sonosco/datasets/audio_dataset.py +++ b/sonosco/datasets/audio_dataset.py @@ -5,8 +5,8 @@ import logging import torch -import torchaudio import librosa +import numpy as np import sonosco.config.global_settings as global_settings import sonosco.common.audio_tools as audio_tools @@ -46,18 +46,15 @@ def window_stride_samples(self): def window_size_samples(self): return int(self.sample_rate * self.window_stride) - @staticmethod - def retrieve_file(audio_path): - sound, sample_rate = torchaudio.load(audio_path) + def retrieve_file(self, audio_path): + sound, sample_rate = librosa.load(audio_path, sr=self.sample_rate) return sound, sample_rate @staticmethod def augment_audio(sound): - sound_array = sound.numpy().squeeze() - stretched_sound = audio_tools.stretch(sound_array, 
0.5) - import pdb; pdb.set_trace() - stretched_sound = audio_tools.shift(stretched_sound, 4000) - return torch.from_numpy(stretched_sound) + augmented = audio_tools.stretch(sound, 0.5) + augmented = audio_tools.shift(augmented, 4000) + return augmented def parse_audio(self, audio_path): sound, sample_rate = self.retrieve_file(audio_path) @@ -65,22 +62,23 @@ def parse_audio(self, audio_path): if sample_rate != self.sample_rate: raise ValueError(f"The stated sample rate {self.sample_rate} and the factual rate {sample_rate} differ!") - librosa.output.write_wav("/Users/yuriy/Desktop/original.wav", sound.numpy().transpose(), sample_rate) + librosa.output.write_wav("/Users/yuriy/Desktop/original.wav", sound, sample_rate) if self.augment: stretched_sound = self.augment_audio(sound) - librosa.output.write_wav("/Users/yuriy/Desktop/stretched.wav", stretched_sound.numpy().transpose(), sample_rate) + librosa.output.write_wav("/Users/yuriy/Desktop/stretched.wav", stretched_sound, sample_rate) if global_settings.CUDA_ENABLED: sound = sound.cuda() # TODO: comment why take the last element? - spectrogram = torch.stft(torch.from_numpy(sound.numpy().T.squeeze()), - n_fft=self.window_size_samples, - hop_length=self.window_stride_samples, - win_length=self.window_size_samples, - window=torch.hamming_window(self.window_size_samples), - normalized=self.normalize)[:, :, -1] + D = librosa.stft(sound, + n_fft=self.window_size_samples, + hop_length=self.window_stride_samples, + win_length=self.window_size_samples) + spectrogram, phase = librosa.magphase(D) + # S = log(S+1) + spectrogram = torch.from_numpy(np.log1p(spectrogram)) return spectrogram From 7f4b362376022f0c735e3e5257e0a659848938df Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Sun, 16 Jun 2019 15:22:53 +0200 Subject: [PATCH 37/58] add augmentation --- requirements.txt | 3 ++- sonosco/common/audio_tools.py | 7 ++++- sonosco/common/utils.py | 5 ++++ sonosco/datasets/audio_data_sampler.py | 1 + sonosco/datasets/audio_dataset.py | 16 ++++++++--- .../datasets/download_datasets/librispeech.py | 5 ++-- tests/test_dataset.py | 27 ++++++++++++------- 7 files changed, 46 insertions(+), 18 deletions(-) diff --git a/requirements.txt b/requirements.txt index 3547e7e..eaaa70e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,4 +20,5 @@ torchvision==0.3.0 tqdm==4.32.1 pyyaml==5.1 wget==3.2 -pytest \ No newline at end of file +pytest +click \ No newline at end of file diff --git a/sonosco/common/audio_tools.py b/sonosco/common/audio_tools.py index 4cd1ac8..45367a6 100644 --- a/sonosco/common/audio_tools.py +++ b/sonosco/common/audio_tools.py @@ -19,7 +19,7 @@ def transcode_recordings_ted3(source, destination, start_time, end_time, sample_ subprocess.call([f"sox {source} -r {sample_rate} -b 16 -c 1 {destination} trim {start_time} ={end_time}"],shell=True) -def add_noise(audio, std=0.005): +def add_noise(audio, std=0.002): noise = np.random.randn(len(audio)) data_noise = audio + std * noise return data_noise @@ -32,3 +32,8 @@ def shift(audio, n_samples=1600): def stretch(audio, rate=1): stretched_audio = librosa.effects.time_stretch(audio, rate) return stretched_audio + + +def pitch_shift(audio, sample_rate=16000, n_steps=3.0): + stretched_audio = librosa.effects.pitch_shift(audio, sr=sample_rate, n_steps=n_steps) + return stretched_audio diff --git a/sonosco/common/utils.py b/sonosco/common/utils.py index a570af0..5e86342 100644 --- a/sonosco/common/utils.py +++ b/sonosco/common/utils.py @@ -1,5 +1,6 @@ import logging import os +import numpy as np def 
setup_logging(logger: logging.Logger, filename=None, verbosity=False): @@ -19,3 +20,7 @@ def setup_logging(logger: logging.Logger, filename=None, verbosity=False): c_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') c_handler.setFormatter(c_format) logger.addHandler(c_handler) + + +def random_float(low: float, high: float): + return np.random.random() * (high - low) + low diff --git a/sonosco/datasets/audio_data_sampler.py b/sonosco/datasets/audio_data_sampler.py index 100bde7..b3bfc14 100644 --- a/sonosco/datasets/audio_data_sampler.py +++ b/sonosco/datasets/audio_data_sampler.py @@ -6,6 +6,7 @@ from torch.distributed.deprecated import get_rank from torch.distributed.deprecated import get_world_size + class BucketingSampler(Sampler): def __init__(self, data_source, batch_size=1): """ diff --git a/sonosco/datasets/audio_dataset.py b/sonosco/datasets/audio_dataset.py index a38d0fd..10a0a80 100644 --- a/sonosco/datasets/audio_dataset.py +++ b/sonosco/datasets/audio_dataset.py @@ -9,11 +9,17 @@ import numpy as np import sonosco.config.global_settings as global_settings import sonosco.common.audio_tools as audio_tools +import sonosco.common.utils as utils from torch.utils.data import Dataset LOGGER = logging.getLogger(__name__) +MIN_STRETCH = 0.7 +MAX_STRETCH = 1.3 +MIN_PITCH = 0.5 +MAX_PITCH = 2.0 +MAX_SHIFT = 4000 class DataProcessor: @@ -50,10 +56,12 @@ def retrieve_file(self, audio_path): sound, sample_rate = librosa.load(audio_path, sr=self.sample_rate) return sound, sample_rate - @staticmethod - def augment_audio(sound): - augmented = audio_tools.stretch(sound, 0.5) - augmented = audio_tools.shift(augmented, 4000) + def augment_audio(self, sound, stretch=True, shift=False, pitch=True, noise=True): + augmented = audio_tools.stretch(sound, utils.random_float(MIN_STRETCH, MAX_STRETCH)) if stretch else sound + augmented = audio_tools.shift(augmented, np.random.randint(MAX_SHIFT)) if shift else augmented + augmented = audio_tools.pitch_shift(augmented, self.sample_rate, + n_steps=utils.random_float(MIN_PITCH, MAX_PITCH)) if pitch else augmented + augmented = audio_tools.add_noise(augmented) if noise else augmented return augmented def parse_audio(self, audio_path): diff --git a/sonosco/datasets/download_datasets/librispeech.py b/sonosco/datasets/download_datasets/librispeech.py index d4fb0e7..246f0e2 100644 --- a/sonosco/datasets/download_datasets/librispeech.py +++ b/sonosco/datasets/download_datasets/librispeech.py @@ -122,11 +122,10 @@ def _process_file(wav_dir, txt_dir, base_filename, root_dir, sample_rate): help="Prunes training samples longer than the max duration (given in seconds).") def main(**kwargs): """Processes and downloads LibriSpeech dataset.""" - global LOGGER - LOGGER = logging.getLogger(SONOSCO) - setup_logging(LOGGER) try_download_librispeech(**kwargs) if __name__ == "__main__": + LOGGER = logging.getLogger(SONOSCO) + setup_logging(LOGGER) main() diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 359844f..45db22c 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -1,18 +1,26 @@ import logging import os +import pytest from sonosco.common.constants import SONOSCO +from sonosco.common.utils import setup_logging from sonosco.datasets.audio_dataset import AudioDataset, DataProcessor from sonosco.datasets.audio_data_sampler import BucketingSampler from sonosco.datasets.audio_data_loader import DataLoader from sonosco.datasets.download_datasets.librispeech import try_download_librispeech -LOGGER = 
logging.getLogger(SONOSCO) LIBRI_SPEECH_DIR = "temp/test_data/libri_speech" -def test_librispeech_download(): +@pytest.fixture +def logger(): + logger = logging.getLogger(SONOSCO) + setup_logging(logger) + return logger + + +def test_librispeech_download(logger): # prepare if os.path.exists(LIBRI_SPEECH_DIR): os.removedirs(LIBRI_SPEECH_DIR) @@ -22,12 +30,13 @@ def test_librispeech_download(): test_manifest = os.path.join(manifest_directory, "libri_test_clean_manifest.csv") if not os.path.exists(test_manifest): + logger.info("Starting to download dataset") try_download_librispeech(LIBRI_SPEECH_DIR, 16000, ["test-clean.tar.gz", "test-other.tar.gz"], 1, 15) assert os.path.exists(test_manifest) -def test_librispeech_clean(): +def test_librispeech_clean(logger): # create data processor audio_conf = dict(sample_rate=16000, window_size=.02, window_stride=.01, labels='ABCDEFGHIJKLMNOPQRSTUVWXYZ', normalize=True, augment=True) @@ -44,10 +53,10 @@ def test_librispeech_clean(): # create audio dataset test_dataset = AudioDataset(processor, manifest_filepath=test_manifest) - LOGGER.info("Dataset is created") - test = test_dataset[0] - batch_size = 16 - sampler = BucketingSampler(test_dataset, batch_size=batch_size) - dataloader = DataLoader(dataset=test_dataset, num_workers=4, batch_sampler=sampler) - test_dataset[0] + logger.info("Dataset is created") + test = test_dataset[4] + # batch_size = 16 + # sampler = BucketingSampler(test_dataset, batch_size=batch_size) + # dataloader = DataLoader(dataset=test_dataset, num_workers=4, batch_sampler=sampler) + # test_dataset[0] From f96661f938bd4f0f8b07af161b0f9bb0c96c957a Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Sun, 16 Jun 2019 15:27:02 +0200 Subject: [PATCH 38/58] remove test code --- sonosco/datasets/audio_dataset.py | 37 ++++++++++++++++++++----------- tests/test_dataset.py | 23 ++++++++++++++++--- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/sonosco/datasets/audio_dataset.py b/sonosco/datasets/audio_dataset.py index 10a0a80..0dea305 100644 --- a/sonosco/datasets/audio_dataset.py +++ b/sonosco/datasets/audio_dataset.py @@ -17,8 +17,8 @@ LOGGER = logging.getLogger(__name__) MIN_STRETCH = 0.7 MAX_STRETCH = 1.3 -MIN_PITCH = 0.5 -MAX_PITCH = 2.0 +MIN_PITCH = 0.7 +MAX_PITCH = 1.5 MAX_SHIFT = 4000 @@ -64,27 +64,29 @@ def augment_audio(self, sound, stretch=True, shift=False, pitch=True, noise=True augmented = audio_tools.add_noise(augmented) if noise else augmented return augmented - def parse_audio(self, audio_path): + def parse_audio(self, audio_path, raw=False): sound, sample_rate = self.retrieve_file(audio_path) if sample_rate != self.sample_rate: raise ValueError(f"The stated sample rate {self.sample_rate} and the factual rate {sample_rate} differ!") - librosa.output.write_wav("/Users/yuriy/Desktop/original.wav", sound, sample_rate) - if self.augment: - stretched_sound = self.augment_audio(sound) - librosa.output.write_wav("/Users/yuriy/Desktop/stretched.wav", stretched_sound, sample_rate) + sound = self.augment_audio(sound) + + if raw: + return sound + + sound_tensor = torch.from_numpy(sound) if global_settings.CUDA_ENABLED: - sound = sound.cuda() + sound_tensor = sound_tensor.cuda() # TODO: comment why take the last element? 
-        D = librosa.stft(sound,
-                         n_fft=self.window_size_samples,
-                         hop_length=self.window_stride_samples,
-                         win_length=self.window_size_samples)
-        spectrogram, phase = librosa.magphase(D)
+        complex_spectrogram = librosa.stft(sound,
+                                           n_fft=self.window_size_samples,
+                                           hop_length=self.window_stride_samples,
+                                           win_length=self.window_size_samples)
+        spectrogram, phase = librosa.magphase(complex_spectrogram)
         # S = log(S+1)
         spectrogram = torch.from_numpy(np.log1p(spectrogram))
@@ -119,6 +121,15 @@ def __init__(self, processor: DataProcessor, manifest_filepath):
         self.size = len(ids)
         self.processor = processor

+    def get_raw(self, index):
+        sample = self.ids[index]
+        audio_path, transcript_path = sample[0], sample[1]
+
+        sound = self.processor.parse_audio(audio_path, raw=True)
+        transcript = self.processor.parse_transcript(transcript_path)
+
+        return sound, transcript
+
     def __getitem__(self, index):
         sample = self.ids[index]
         audio_path, transcript_path = sample[0], sample[1]
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index 45db22c..a80c5c9 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -1,6 +1,8 @@
 import logging
 import os
 import pytest
+import shutil
+import numpy as np
+import librosa

 from sonosco.common.constants import SONOSCO
 from sonosco.common.utils import setup_logging
@@ -38,7 +42,7 @@ def test_librispeech_download(logger):

 def test_librispeech_clean(logger):
     # create data processor
-    audio_conf = dict(sample_rate=16000, window_size=.02, window_stride=.01,
+    audio_conf = dict(sample_rate=SAMPLE_RATE, window_size=.02, window_stride=.01,
                       labels='ABCDEFGHIJKLMNOPQRSTUVWXYZ', normalize=True, augment=True)
     processor = DataProcessor(**audio_conf)

@@ -47,14 +51,27 @@ def test_librispeech_clean(logger):

     if not os.path.exists(test_manifest):
-        try_download_librispeech(LIBRI_SPEECH_DIR, 16000, ["test-clean.tar.gz", "test-other.tar.gz"], 1, 15)
+        try_download_librispeech(LIBRI_SPEECH_DIR, SAMPLE_RATE, ["test-clean.tar.gz", "test-other.tar.gz"], 1, 15)

     assert os.path.exists(test_manifest)

     # create audio dataset
     test_dataset = AudioDataset(processor, manifest_filepath=test_manifest)
     logger.info("Dataset is created")
-    test = test_dataset[4]
+
+    if os.path.exists(TEST_WAVS_DIR):
+        shutil.rmtree(TEST_WAVS_DIR)
+
+    os.makedirs(TEST_WAVS_DIR)
+
+    n_samples = len(test_dataset)
+
+    ids = np.random.randint(n_samples, size=min(10, n_samples))
+
+    for index in ids:
+        sound, transcription = test_dataset.get_raw(index)
+        librosa.output.write_wav(os.path.join(TEST_WAVS_DIR, f"audio_{index}.wav"), sound, SAMPLE_RATE)
+
     # batch_size = 16
     # sampler = BucketingSampler(test_dataset, batch_size=batch_size)
     # dataloader = DataLoader(dataset=test_dataset, num_workers=4, batch_sampler=sampler)
     # test_dataset[0]
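The feature pipeline patch 38 lands on is a log-magnitude STFT. Pulled out of DataProcessor, the transformation looks like the following self-contained sketch, which mirrors parse_audio's librosa calls with the processor's default 20 ms window and 10 ms hop:

import librosa
import numpy as np
import torch


def log_magnitude_spectrogram(sound, sample_rate=16000,
                              window_size=0.02, window_stride=0.01):
    win_length = int(sample_rate * window_size)
    hop_length = int(sample_rate * window_stride)
    stft = librosa.stft(sound, n_fft=win_length,
                        hop_length=hop_length, win_length=win_length)
    magnitudes, _ = librosa.magphase(stft)
    # S = log(S + 1), as in parse_audio
    return torch.from_numpy(np.log1p(magnitudes))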
From 8d59622c84625adfce596cefe866697f814735a7 Mon Sep 17 00:00:00 2001
From: yuriyarabskyy
Date: Sun, 16 Jun 2019 16:09:01 +0200
Subject: [PATCH 39/58] start adding experiment class

---
 sonosco/training/__init__.py   |  0
 sonosco/training/experiment.py | 90 ++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+)
 create mode 100644 sonosco/training/__init__.py
 create mode 100644 sonosco/training/experiment.py

diff --git a/sonosco/training/__init__.py b/sonosco/training/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/sonosco/training/experiment.py b/sonosco/training/experiment.py
new file mode 100644
index 0000000..dd04c46
--- /dev/null
+++ b/sonosco/training/experiment.py
@@ -0,0 +1,90 @@
+import os
+import os.path as path
+import sys
+from time import time
+import datetime
+
+from .tee import Tee
+from .utils import *
+
+
+class Experiment:
+    """
+    Generates a folder where all experiments will be stored and then a named experiment with current
+    timestamp and provided name. Automatically starts logging the console output and creates a copy
+    of the currently executed code in the experiment folder. The experiment's subfolder paths are provided
+    to the outside as member variables. It also allows adding of more subfolders conveniently.
+    Args:
+        experiment_name (string): name of the experiment to be created
+        experiments_path (string): location where all experiments will be stored, default is './experiments'
+    Example:
+        >>> experiment = Experiment('mnist_classification')
+        >>> print(experiment.plots) # path to experiment plots
+    """
+
+    def __init__(self, experiment_name, experiments_path=None, exclude_dirs=[], exclude_files=[]):
+        self.experiments_path = self._set_experiments_dir(experiments_path)
+        self.name = self._set_experiment_name(experiment_name)
+        self.path = path.join(self.experiments_path, self.name)  # path to current experiment
+        self._sub_directories = ['plots', 'logs', 'code']  # default sub-directories
+
+        self._exclude_dirs = ['__pycache__', '.git', 'experiments']
+        self._exclude_dirs.extend(exclude_dirs)
+        self._exclude_files = ['.pyc']
+        self._exclude_files.extend(exclude_files)
+
+        self._init_directories()
+        self._tee = Tee(path.join(self.logs, 'console_output.log'), 'w')  # start to log console
+        self._copy_sourcecode()
+
+    def _set_experiments_dir(self, experiments_path):
+        if experiments_path is not None:
+            return experiments_path
+        local_path = os.path.dirname(sys.argv[0])
+        local_path = local_path if local_path != '' else './'
+        return path.join(local_path, "experiments")
+
+    def _set_experiment_name(self, experiment_name):
+        date_time = datetime.datetime.fromtimestamp(time()).strftime('%Y-%m-%d_%H:%M:%S')
+        return date_time + "_" + experiment_name
+
+    def _init_directories(self):
+        """ Create all basic directories. """
+        self._create_directory(self.experiments_path)
+        self._create_directory(path.join(self.experiments_path, self.name))
+        for sub_dir_name in self._sub_directories:
+            self.add_directory(sub_dir_name)
+
+    def _create_directory(self, dir_path):
+        if not path.exists(dir_path):
+            os.makedirs(dir_path)
+
+    def _add_member(self, key, value):
+        """ Add a member variable named 'key' with value 'value' to the experiment instance. """
+        self.__dict__[key] = value
+
+    def _copy_sourcecode(self):
+        """ Copy code from the execution directory into the experiment code directory. """
+        sources_path = os.path.dirname(sys.argv[0])
+        sources_path = sources_path if sources_path != '' else './'
+        copy_code(sources_path, self.code, exclude_dirs=self._exclude_dirs, exclude_files=self._exclude_files)
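experiment.py imports Tee and copy_code from sibling modules that are not part of this patch. For orientation, here is a minimal sketch of what such a console Tee could look like; this is an assumption about the missing module, not the project's actual implementation.

import sys


class Tee:
    """Mirrors everything written to stdout into a log file (a sketch)."""

    def __init__(self, file_path, mode='w'):
        self.file = open(file_path, mode)
        self.stdout = sys.stdout
        sys.stdout = self  # from now on, print() output passes through here

    def write(self, data):
        self.stdout.write(data)
        self.file.write(data)

    def flush(self):
        self.stdout.flush()
        self.file.flush()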
+ """ + # store in sub-dir list + if not dir_name in self._sub_directories: + self._sub_directories.append(dir_name) + # add as member + dir_path = path.join(self.experiments_path, self.name, dir_name) + self._add_member(dir_name, dir_path) + # create directory + self._create_directory(dir_path) + + def add_file(self, folder_path, filename, content): + """ Adds a file with provided content to folder. Convenience function. """ + with open(path.join(folder_path, filename), 'w') as textfile: + textfile.write(content) \ No newline at end of file From 7174f9c843df7ce15287ec7da138bc928c1202b0 Mon Sep 17 00:00:00 2001 From: "w.jurasz" Date: Sun, 16 Jun 2019 17:11:44 +0200 Subject: [PATCH 40/58] Initial class structure and save methods --- requirements.txt | 3 +- sonosco/model.py | 90 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 sonosco/model.py diff --git a/requirements.txt b/requirements.txt index 3547e7e..ae5ddac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,4 +20,5 @@ torchvision==0.3.0 tqdm==4.32.1 pyyaml==5.1 wget==3.2 -pytest \ No newline at end of file +pytest +deprecation==2.0.6 \ No newline at end of file diff --git a/sonosco/model.py b/sonosco/model.py new file mode 100644 index 0000000..5305556 --- /dev/null +++ b/sonosco/model.py @@ -0,0 +1,90 @@ +import logging +import torch +import deprecation + +LOGGER = logging.getLogger(__name__) + + +class Saver: + + def __init__(self) -> None: + super().__init__() + + @deprecation.deprecated( + details="This type of saving may cause problems when path of model class changes. Pleas use save_model instead") + def save_model_simple(self, model, path): + """ + Simply saves the model using pickle protocol. + Args: + model: model to save + path (string) : path where to save the model + + Returns: + + """ + torch.save(model, path) + + def save_model(self, model, path, infer_structure=False, serialize_f_name='serialize'): + """ + Saves the model using pickle protocol. + + If the infer_structure is True this method infers all the meta parameters of the model and save them together + with learnable parameters. + + If the infer_structure is False and method specified by serialize_f_name exists, the return value of the + serialize_f_name method is saved. + + If neither of above only learnable parameters a.k.a. state_dict are saved. 
+
+        Args:
+            model: model to save
+            path (string) : path where to save the model
+            infer_structure (bool): indicator whether to infer the model structure
+            serialize_f_name (string): name of the function that this method should call in order to serialize the model
+
+        Returns:
+
+        """
+        entity_to_save = None
+        if infer_structure:
+            entity_to_save = self.get_constructor_args_with_values(model)
+            entity_to_save['state_dict'] = model.state_dict()
+        elif hasattr(model, serialize_f_name) and callable(getattr(model, serialize_f_name)):
+            entity_to_save = getattr(model, serialize_f_name)()
+        else:
+            entity_to_save['state_dict'] = model.state_dict()
+
+        torch.save(entity_to_save, path)
+
+    @staticmethod
+    def get_constructor_args_with_values(model):
+        """
+        Assigns values to __init__ params names
+
+        For example:
+
+        class Bar():
+            def __init__(self, arg1, arg2):
+                self.arg1 = arg1
+                self.some_other_name = arg2
+
+
+        bar = Bar("A","B")
+        get_constructor_args_with_values(bar)
+        # returns {arg1: arg1_val, arg2: arg2_val}
+
+
+        Args:
+            model: model to infer from
+
+        Returns (dict): Mapping from __init__ argument to its value
+
+        """
+        return {}
+
+
+class Loader:
+
+    def load_model(self, cls, path):
+        package = torch.load(path, map_location=lambda storage, loc: storage)
+        cls()

From fb026367279db3f4f81e6a6772cd15dab068a2d3 Mon Sep 17 00:00:00 2001
From: yuriyarabskyy
Date: Sun, 16 Jun 2019 18:48:55 +0200
Subject: [PATCH 41/58] resolve comments

---
 requirements.txt              |  4 ++--
 sonosco/common/audio_tools.py | 25 +++++++++++++++++++++----
 tests/test_dataset.py         |  2 +-
 3 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index eaaa70e..97f2580 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,5 +20,5 @@ torchvision==0.3.0
 tqdm==4.32.1
 pyyaml==5.1
 wget==3.2
-pytest
-click
\ No newline at end of file
+pytest==4.6.3
+click==7.0
\ No newline at end of file
diff --git a/sonosco/common/audio_tools.py b/sonosco/common/audio_tools.py
index 45367a6..9ab77ab 100644
--- a/sonosco/common/audio_tools.py
+++ b/sonosco/common/audio_tools.py
@@ -19,10 +19,27 @@ def transcode_recordings_ted3(source, destination, start_time, end_time, sample_
     subprocess.call([f"sox {source} -r {sample_rate} -b 16 -c 1 {destination} trim {start_time} ={end_time}"],shell=True)
 
 
-def add_noise(audio, std=0.002):
-    noise = np.random.randn(len(audio))
-    data_noise = audio + std * noise
-    return data_noise
+class NoiseMaker:
+
+    def __call__(self, audio):
+        """Adds noise to the audio signal."""
+        pass
+
+
+class GaussianNoiseMaker(NoiseMaker):
+
+    def __init__(self, std=0.002):
+        self.std = std
+
+    def __call__(self, audio):
+        noise = np.random.randn(len(audio))
+        return audio + self.std * noise
+
+
+def add_noise(audio, noise_maker: NoiseMaker = None):
+    if noise_maker is None:
+        noise_maker = GaussianNoiseMaker()
+    return noise_maker(audio)
 
 
 def shift(audio, n_samples=1600):
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index a80c5c9..ddc70e9 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -43,7 +43,7 @@ def test_librispeech_clean(logger):
     # create data processor
     audio_conf = dict(sample_rate=SAMPLE_RATE, window_size=.02, window_stride=.01,
-                      labels='ABCDEFGHIJKLMNOPQRSTUVWXYZ', normalize=True, augment=True)
+                      labels='ABCDEFGHIJKLMNOPQRSTUVWXYZ', normalize=True, augment=False)
     processor = DataProcessor(**audio_conf)
 
     # get manifest file

From d82fa16ab56f4a3039c373ce0011df436409b42e Mon Sep 17 00:00:00 2001
From: "w.jurasz"
Date: Sun, 16 Jun 2019 22:42:57 +0200
Subject: [PATCH 42/58] Added model loading and some utils

---
 sonosco/common/class_utils.py |  36 +++++++++++
 sonosco/model.py              | 112 ++++++++++++++++++++++++++++------
 2 files changed, 131 insertions(+), 17 deletions(-)
 create mode 100644 sonosco/common/class_utils.py

diff --git a/sonosco/common/class_utils.py b/sonosco/common/class_utils.py
new file mode 100644
index 0000000..b73cafa
--- /dev/null
+++ b/sonosco/common/class_utils.py
@@ -0,0 +1,36 @@
+import inspect
+from typing import List
+
+
+def get_constructor_args(cls: type) -> List[str]:
+    """
+    E.g.
+
+    class Bar():
+        def __init__(self, arg1, arg2):
+
+    get_constructor_args(Bar)
+    # returns ['arg1', 'arg2']
+    Args:
+        cls (type):
+
+    Returns: list containing names of constructor arguments
+
+    """
+    return inspect.getfullargspec(cls.__init__).args[1:]
+
+
+def get_class_by_name(name: str) -> type:
+    """
+    Returns type object of class specified by name
+    Args:
+        name: full name of the class (with packages)
+
+    Returns: class object
+
+    """
+    components = name.split('.')
+    mod = __import__(components[0])
+    for comp in components[1:]:
+        mod = getattr(mod, comp)
+    return mod
diff --git a/sonosco/model.py b/sonosco/model.py
index 5305556..5bbbdea 100644
--- a/sonosco/model.py
+++ b/sonosco/model.py
@@ -1,6 +1,10 @@
 import logging
 import torch
 import deprecation
+import inspect
+import torch.nn as nn
+
+from common.class_utils import get_constructor_args, get_class_by_name
 
 LOGGER = logging.getLogger(__name__)
 
@@ -12,52 +16,53 @@ def __init__(self) -> None:
 
     @deprecation.deprecated(
         details="This type of saving may cause problems when path of model class changes. Please use save_model instead")
-    def save_model_simple(self, model, path):
+    def save_model_simple(self, model: nn.Module, path: str) -> None:
         """
         Simply saves the model using pickle protocol.
         Args:
-            model: model to save
-            path (string) : path where to save the model
+            model (nn.Module): model to save
+            path (str) : path where to save the model
 
         Returns:
 
         """
         torch.save(model, path)
 
-    def save_model(self, model, path, infer_structure=False, serialize_f_name='serialize'):
+    def save_model(self, model: nn.Module, path: str, infer_structure: bool = False,
+                   serialize_method_name: str = 'serialize') -> None:
         """
         Saves the model using pickle protocol.
 
         If the infer_structure is True this method infers all the meta parameters of the model and saves them together
         with learnable parameters.
 
-        If the infer_structure is False and method specified by serialize_f_name exists, the return value of the
-        serialize_f_name method is saved.
+        If the infer_structure is False and method specified by serialize_method_name exists, the return value of the
+        serialize_method_name method is saved.
 
         If neither of the above applies, only learnable parameters (a.k.a. state_dict) are saved.
 
         Args:
-            model: model to save
-            path (string) : path where to save the model
+            model (nn.Module): model to save
+            path (str) : path where to save the model
             infer_structure (bool): indicator whether to infer the model structure
-            serialize_f_name (string): name of the function that this method should call in order to serialize the model
+            serialize_method_name (str): name of the function that this method should call in order to serialize the
+            model. Must return dict.
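+            A hedged usage sketch (model and path are placeholders):
+            >>> Saver().save_model(model, 'checkpoints/model.pth', infer_structure=True)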
 
         Returns:
 
         """
-        entity_to_save = None
         if infer_structure:
             entity_to_save = self.get_constructor_args_with_values(model)
             entity_to_save['state_dict'] = model.state_dict()
-        elif hasattr(model, serialize_f_name) and callable(getattr(model, serialize_f_name)):
-            entity_to_save = getattr(model, serialize_f_name)()
+        elif hasattr(model, serialize_method_name) and callable(getattr(model, serialize_method_name)):
+            entity_to_save = getattr(model, serialize_method_name)()
         else:
-            entity_to_save['state_dict'] = model.state_dict()
+            entity_to_save = {'state_dict': model.state_dict()}
 
         torch.save(entity_to_save, path)
 
     @staticmethod
-    def get_constructor_args_with_values(model):
+    def get_constructor_args_with_values(model: nn.Module):
         """
         Assigns values to __init__ params names
 
@@ -75,7 +80,7 @@ def __init__(self, arg1, arg2):
 
         Args:
-            model: model to infer from
+            model (nn.Module): model to infer from
 
         Returns (dict): Mapping from __init__ argument to its value
 
@@ -85,6 +90,79 @@ def __init__(self, arg1, arg2):
 
 class Loader:
 
-    def load_model(self, cls, path):
+    @deprecation.deprecated(
+        details="This type of loading may cause problems when path of model class changes. "
+                "Please use only when saved with save_model_simple method")
+    def load_model_simple(self, path: str):
+        """
+        Loads a model that was saved with save_model_simple (a pickle of the full model object).
+
+        Args:
+            path (str): path to the pickled model
+
+        Returns: the unpickled model
+
+        """
+        return torch.load(path)
+
+    def load_model_from_path(self, cls_path: str, path: str, deserialize_method_name: str = 'deserialize') -> nn.Module:
+        """
+        Loads the model from pickle file.
+
+        If deserialize_method_name exists the deserialized content of pickle file in path is passed to the
+        deserialize_method_name method. In this case,
+        the responsibility of creating cls object stays at the caller side.
+
+        Args:
+            cls_path (str): name of the class of the model
+            path (str): path to pickle-serialized model or model parameters
+            deserialize_method_name (str): name of the function that this method should call in order to deserialize the
+            model. Must accept single argument of type dict.
+
+
+        Returns (nn.Module): Loaded model
+
+        """
+        return self.load_model(get_class_by_name(cls_path), path, deserialize_method_name)
+
+    def load_model(self, cls: type, path: str, deserialize_method_name: str = 'deserialize') -> nn.Module:
+        """
+        Loads the model from pickle file.
+
+        If deserialize_method_name exists the deserialized content of pickle file in path is passed to the
+        deserialize_method_name method. In this case,
+        the responsibility of creating cls object stays at the caller side.
+
+        Args:
+            cls (type): class object of the model
+            path (str): path to pickle-serialized model or model parameters
+            deserialize_method_name (str): name of the function that this method should call in order to deserialize the
+            model. Must accept single argument of type dict.
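+            A usage sketch (class and checkpoint path are illustrative):
+            >>> model = Loader().load_model(DeepSpeech2, 'checkpoints/final.pth')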
+
+
+        Returns (nn.Module): Loaded model
+
+        """
         package = torch.load(path, map_location=lambda storage, loc: storage)
-        cls()
+        if hasattr(cls, deserialize_method_name) and callable(getattr(cls, deserialize_method_name)):
+            return getattr(cls, deserialize_method_name)(package)
+        constructor_args = set(get_constructor_args(cls))
+        stored_keys = set(package.keys())
+        stored_keys.remove('state_dict')
+
+        args_to_apply = constructor_args & stored_keys
+        if len(args_to_apply) != len(constructor_args):
+            not_in_constructor = stored_keys - constructor_args
+            if not_in_constructor:
+                LOGGER.warning(
+                    f"Following fields were deserialized "
+                    f"but could not be found in constructor of provided class {not_in_constructor}")
+            not_in_package = constructor_args - stored_keys
+            if not_in_package:
+                LOGGER.warning(
+                    f"Following fields exist in class constructor "
+                    f"but could not be found in serialized package {not_in_package}")
+
+        filtered_package = {key: package[key] for key in args_to_apply}
+        model = cls(**filtered_package)
+        model.load_state_dict(package['state_dict'])
+        return model

From 678095d49ecdefe3dbb4100bd334418d5f3f5292 Mon Sep 17 00:00:00 2001
From: yuriyarabskyy
Date: Mon, 17 Jun 2019 18:22:54 +0200
Subject: [PATCH 43/58] add experiment

---
 sonosco/common/utils.py        | 69 +++++++++++++++++++++++++++++++++-
 sonosco/training/experiment.py | 60 ++++++++++++++++------------
 2 files changed, 102 insertions(+), 27 deletions(-)

diff --git a/sonosco/common/utils.py b/sonosco/common/utils.py
index 5e86342..4dcbaba 100644
--- a/sonosco/common/utils.py
+++ b/sonosco/common/utils.py
@@ -1,6 +1,11 @@
 import logging
-import os
 import numpy as np
+import os
+import subprocess
+import os.path as path
+
+from shutil import copyfile
+from typing import Tuple
 
 
 def setup_logging(logger: logging.Logger, filename=None, verbosity=False):
@@ -24,3 +29,65 @@ def setup_logging(logger: logging.Logger, filename=None, verbosity=False):
 
 def random_float(low: float, high: float):
     return np.random.random() * (high - low) + low
+
+
+def copy_code(source_dir, dest_dir, exclude_dirs: Tuple[str] = tuple(), exclude_files: Tuple[str] = tuple()):
+    """
+    Copies code from source_dir to dest_dir. Excludes specified folders and files by substring-matching.
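+    A call sketch (paths are illustrative):
+    >>> copy_code('./sonosco', './experiments/run/code', exclude_dirs=('.git',), exclude_files=('.pyc',))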
+    Parameters:
+        source_dir (string): location of the code to copy
+        dest_dir (string): location where the code should be copied to
+        exclude_dirs (tuple of strings): folders containing strings specified in this tuple will be ignored
+        exclude_files (tuple of strings): files containing strings specified in this tuple will be ignored
+    """
+    source_basename = path.basename(source_dir)
+    for root, dirs, files in os.walk(source_dir, topdown=True):
+
+        # skip ignored dirs
+        if any(ex_subdir in root for ex_subdir in exclude_dirs):
+            continue
+
+        # construct destination dir
+        cropped_root = root[2:] if (root[:2] == './') else root
+        subdir_basename = path.basename(cropped_root)
+
+        # do not treat the root as a subdir
+        if subdir_basename == source_basename:
+            subdir_basename = ""
+        dest_subdir = os.path.join(dest_dir, subdir_basename)
+
+        # create destination folder
+        if not os.path.exists(dest_subdir):
+            os.makedirs(dest_subdir)
+
+        # copy files
+        for filename in filter(lambda x: not any(substr in x for substr in exclude_files), files):
+            source_file_path = os.path.join(root, filename)
+            dest_file_path = os.path.join(dest_subdir, filename)
+            copyfile(source_file_path, dest_file_path)
+
+
+def retrieve_git_hash():
+    """
+    Retrieves and returns the current git hash if execution location is a git repo.
+    """
+    try:
+        git_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).strip()
+        return git_hash
+    except subprocess.CalledProcessError as e:
+        print(e.output)
+        return False
+
+
+def save_run_params_in_file(folder_path, run_config):
+    """
+    Receives a run_config class, retrieves all member variables and saves them
+    in a config file for logging purposes.
+    Parameters:
+        folder_path - output folder
+        run_config - shallow class with parameter members
+    """
+    with open(path.join(folder_path, "run_params.conf"), 'w') as run_param_file:
+        for attr, value in sorted(run_config.__dict__.items()):
+            run_param_file.write(f"{attr}: {value}\n")
diff --git a/sonosco/training/experiment.py b/sonosco/training/experiment.py
index dd04c46..23f46e0 100644
--- a/sonosco/training/experiment.py
+++ b/sonosco/training/experiment.py
@@ -1,11 +1,11 @@
 import os
 import os.path as path
-import sys
 from time import time
 import datetime
+import sonosco.common.path_utils as path_utils
 
-from .tee import Tee
-from .utils import *
+from time import time
+from sonosco.common.utils import copy_code
 
 
 class Experiment:
@@ -22,44 +22,49 @@ class Experiment:
         >>> print(experiment.plots) # path to experiment plots
     """
 
-    def __init__(self, experiment_name, experiments_path=None, exclude_dirs=[], exclude_files=[]):
+    def __init__(self,
+                 experiment_name,
+                 experiments_path=None,
+                 sub_directories=("plots", "logs", "code"),
+                 exclude_dirs=('__pycache__', '.git', 'experiments'),
+                 exclude_files=('.pyc',)):
+
         self.experiments_path = self._set_experiments_dir(experiments_path)
         self.name = self._set_experiment_name(experiment_name)
         self.path = path.join(self.experiments_path, self.name) # path to current experiment
-        self._sub_directories = ['plots', 'logs', 'code'] # default sub-directories
+        self.logs = path.join(self.experiments_path, "logs")
+        self.code = path.join(self.experiments_path, "code")
+        self._sub_directories = sub_directories
 
-        self._exclude_dirs = ['__pycache__', '.git', 'experiments']
+        self._exclude_dirs = exclude_dirs
         self._exclude_dirs.extend(exclude_dirs)
-        self._exclude_files = ['.pyc']
+
self._exclude_files = exclude_files self._exclude_files.extend(exclude_files) - self._init_directories() - self._tee = Tee(path.join(self.logs, 'console_output.log'), 'w') # start to log console self._copy_sourcecode() - def _set_experiments_dir(self, experiments_path): - if experiments_path != None: + @staticmethod + def _set_experiments_dir(experiments_path): + if experiments_path is not None: return experiments_path + local_path = os.path.dirname(sys.argv[0]) local_path = local_path if local_path != '' else './' return path.join(local_path, "experiments") - def _set_experiment_name(self, experiment_name): + @staticmethod + def _set_experiment_name(experiment_name): date_time = datetime.datetime.fromtimestamp(time()).strftime('%Y-%m-%d_%H:%M:%S') - return date_time + "_" + experiment_name + return f"{date_time}_{experiment_name}" def _init_directories(self): """ Create all basic directories. """ - self._create_directory(self.experiments_path) - self._create_directory(path.join(self.experiments_path, self.name)) + path_utils.try_create_directory(self.experiments_path) + path_utils.try_create_directory(path.join(self.experiments_path, self.name)) for sub_dir_name in self._sub_directories: self.add_directory(sub_dir_name) - def _create_directory(self, dir_path): - if not path.exists(dir_path): - os.makedirs(dir_path) - def _add_member(self, key, value): """ Add a member variable named 'key' with value 'value' to the experiment instance. """ self.__dict__[key] = value @@ -68,7 +73,9 @@ def _copy_sourcecode(self): """ Copy code from execution directory in experiment code directory. """ sources_path = os.path.dirname(sys.argv[0]) sources_path = sources_path if sources_path != '' else './' - copy_code(sources_path, self.code, exclude_dirs=self._exclude_dirs, exclude_files=self._exclude_files )# exclude_dirs=[path.basename(self.experiments_path), '.vscode', '.git']) + copy_code(sources_path, self.code, + exclude_dirs=self._exclude_dirs, + exclude_files=self._exclude_files) def add_directory(self, dir_name): """ @@ -76,15 +83,16 @@ def add_directory(self, dir_name): created and provided to the outside as a member variable. """ # store in sub-dir list - if not dir_name in self._sub_directories: + if dir_name not in self._sub_directories: self._sub_directories.append(dir_name) # add as member dir_path = path.join(self.experiments_path, self.name, dir_name) self._add_member(dir_name, dir_path) # create directory - self._create_directory(dir_path) + path_utils.try_create_directory(dir_path) - def add_file(self, folder_path, filename, content): + @staticmethod + def add_file(folder_path, filename, content): """ Adds a file with provided content to folder. Convenience function. 
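        Example (folder and file name are illustrative):
        >>> experiment.add_file(experiment.logs, 'notes.txt', 'some content')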
""" - with open(path.join(folder_path, filename), 'w') as textfile: - textfile.write(content) \ No newline at end of file + with open(path.join(folder_path, filename), 'w') as text_file: + text_file.write(content) From fbb90023f9734fa01e8d70fe9628567beec03cc7 Mon Sep 17 00:00:00 2001 From: ga38nif Date: Tue, 18 Jun 2019 22:37:23 +0200 Subject: [PATCH 44/58] delete test script --- sonosco/datasets/datasets_test_script.py | 33 ------------------------ 1 file changed, 33 deletions(-) delete mode 100644 sonosco/datasets/datasets_test_script.py diff --git a/sonosco/datasets/datasets_test_script.py b/sonosco/datasets/datasets_test_script.py deleted file mode 100644 index f4daf79..0000000 --- a/sonosco/datasets/datasets_test_script.py +++ /dev/null @@ -1,33 +0,0 @@ -import os - -from AudioDataLoader import AudioDataLoader -from AudioDataSampler import BucketingSampler, DistributedBucketingSampler -from AudioDataset import AudioDataset - - -def main(): - labels_path = "/Users/florianlay/roboy/sonosco/sonosco/datasets/labels.json" - with open(labels_path) as label_file: - labels = str(''.join(json.load(label_file))) - - audio_conf = dict(sample_rate=16000, - window_size=.02, - window_stride=.01, - window='hamming') - - manifest_directory = os.path.join(os.path.expanduser("~"), "temp/data/libri_speech") - test_manifest = os.path.join(manifest_directory, "libri_test_clean_manifest.csv") - labels = 'abc' - test_dataset = AudioDataset(audio_conf=audio_conf, manifest_filepath=test_manifest, labels=labels, - normalize=False, augment=False) - print("Dataset is created\n====================\n") - - test = test_dataset[0] - batch_size = 16 - sampler = BucketingSampler(test_dataset, batch_size=batch_size) - dataloader = AudioDataLoader(test_dataset,num_workers=4, batch_sampler=sampler) - - inputs, targets, input_percentages, target_sizes = next(iter(dataloader)) - print(test) -if __name__ == "__main__": - main() \ No newline at end of file From 0590f01fc00dc0ea296baf9eccaad35dc27eb083 Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Wed, 19 Jun 2019 17:34:35 +0200 Subject: [PATCH 45/58] start creating training interface --- .gitignore | 2 + sonosco/common/path_utils.py | 7 + sonosco/common/utils.py | 24 +- sonosco/config/train.yaml | 1 + sonosco/datasets/__init__.py | 3 + sonosco/datasets/audio_data_loader.py | 33 --- sonosco/datasets/audio_dataset.py | 4 +- sonosco/datasets/data_loader.py | 59 ++++ ...{audio_data_sampler.py => data_sampler.py} | 0 .../datasets/download_datasets/__init__.py | 3 + sonosco/models/__init__.py | 1 + sonosco/run_training.py | 39 +++ sonosco/train.py | 18 -- sonosco/training/__init__.py | 2 + sonosco/training/abstract_callback.py | 25 ++ sonosco/training/experiment.py | 28 +- sonosco/training/helpers.py | 143 ++++++++++ sonosco/training/learning_rates.py | 131 +++++++++ sonosco/training/trainer.py | 252 ++++++++++++++++++ tests/test_dataset.py | 8 +- 20 files changed, 708 insertions(+), 75 deletions(-) delete mode 100644 sonosco/datasets/audio_data_loader.py create mode 100644 sonosco/datasets/data_loader.py rename sonosco/datasets/{audio_data_sampler.py => data_sampler.py} (100%) create mode 100644 sonosco/run_training.py delete mode 100644 sonosco/train.py create mode 100644 sonosco/training/abstract_callback.py create mode 100644 sonosco/training/helpers.py create mode 100644 sonosco/training/learning_rates.py create mode 100644 sonosco/training/trainer.py diff --git a/.gitignore b/.gitignore index 453b1ff..e2e1859 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ # 
Created by .ignore support plugin (hsz.mobi)
 sonosco/pycandle/
+experiments/
+tests/test_wavs/
 sonosco/pycandle
 sonosco/experiments/
 sonosco/datasets/download_datasets/
diff --git a/sonosco/common/path_utils.py b/sonosco/common/path_utils.py
index bf8c8ef..03321bb 100644
--- a/sonosco/common/path_utils.py
+++ b/sonosco/common/path_utils.py
@@ -1,5 +1,7 @@
 import os
 import wget
+import yaml
+import codecs
 
 
 def try_create_directory(path: str):
@@ -10,3 +12,8 @@ def try_create_directory(path: str):
 def try_download(destination: str, url: str):
     if not os.path.exists(destination):
         wget.download(url, destination)
+
+
+def parse_yaml(file_path: str):
+    with codecs.open(file_path, "r", "utf-8") as file:
+        return yaml.load(file)
diff --git a/sonosco/common/utils.py b/sonosco/common/utils.py
index 4dcbaba..d5b8d25 100644
--- a/sonosco/common/utils.py
+++ b/sonosco/common/utils.py
@@ -10,16 +10,10 @@ def setup_logging(logger: logging.Logger, filename=None, verbosity=False):
     logger.setLevel(logging.DEBUG)
+
     if filename is not None:
-        log_directory = os.path.dirname(filename)
-        if not os.path.exists(log_directory):
-            os.makedirs(log_directory)
-        filename = os.path.join(log_directory, f"{filename}.log")
-        f_handler = logging.FileHandler(filename=filename, mode="w")
-        f_handler.setLevel(logging.DEBUG)
-        f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-        f_handler.setFormatter(f_format)
-        logger.addHandler(f_handler)
+        add_log_file(filename, logger)
+
     c_handler = logging.StreamHandler()
     c_handler.setLevel(logging.DEBUG) if verbosity else c_handler.setLevel(logging.INFO)
     c_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
@@ -27,6 +21,18 @@ def setup_logging(logger: logging.Logger, filename=None, verbosity=False):
     logger.addHandler(c_handler)
 
 
+def add_log_file(filename: str, logger: logging.Logger):
+    log_directory = os.path.dirname(filename)
+    if log_directory and not os.path.exists(log_directory):
+        os.makedirs(log_directory)
+    f_handler = logging.FileHandler(filename=f"{filename}.log", mode="w")
+    f_handler.setLevel(logging.DEBUG)
+    f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+    f_handler.setFormatter(f_format)
+    logger.addHandler(f_handler)
+
+
 def random_float(low: float, high: float):
     return np.random.random() * (high - low) + low
diff --git a/sonosco/config/train.yaml b/sonosco/config/train.yaml
index 25be72c..3e9cba9 100644
--- a/sonosco/config/train.yaml
+++ b/sonosco/config/train.yaml
@@ -16,6 +16,7 @@ train:
   hidden_size: 800 # Hidden size of RNNs
   hidden_layers: 5 # Number of RNN layers
   rnn_type: 'gru' # Type of the RNN unit: gru|lstm are supported
+  labels: 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' # labels used by the model
 
   max_epochs: 70 # Number of training epochs
   learning_rate: 3e-4 # Initial learning rate
diff --git a/sonosco/datasets/__init__.py b/sonosco/datasets/__init__.py
index e69de29..9cfa7a3 100644
--- a/sonosco/datasets/__init__.py
+++ b/sonosco/datasets/__init__.py
@@ -0,0 +1,3 @@
+from .audio_dataset import AudioDataProcessor, AudioDataset
+from .data_sampler import BucketingSampler
+from .data_loader import AudioDataLoader, create_data_loaders
diff --git a/sonosco/datasets/audio_data_loader.py b/sonosco/datasets/audio_data_loader.py
deleted file mode 100644
index 967b89c..0000000
--- a/sonosco/datasets/audio_data_loader.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import numpy as np
-import torch
-
-from torch.utils.data import Dataset, DataLoader, Sampler
-
-
-class AudioDataLoader(DataLoader): - - def __init__(self, *args, **kwargs): - """Creates a data loader for AudioDatasets.""" - super(AudioDataLoader, self).__init__(*args, **kwargs) - self.collate_fn = self._collate_fn - - # TODO: Optimise - def _collate_fn(self, batch): - batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True) - longest_sample = batch[0][0] - freq_size, max_seqlength = longest_sample.size() - minibatch_size = len(batch) - inputs = torch.zeros(minibatch_size, 1, freq_size, max_seqlength) - input_percentages = torch.FloatTensor(minibatch_size) - target_sizes = np.zeros(minibatch_size, dtype=np.int32) - - # TODO: Numpy broadcasting magic - targets = [] - - for x in range(minibatch_size): - inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) - input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) - target_sizes[x] = len(batch[x][1]) - targets.extend(batch[x][1]) - - return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) \ No newline at end of file diff --git a/sonosco/datasets/audio_dataset.py b/sonosco/datasets/audio_dataset.py index 0dea305..c82e697 100644 --- a/sonosco/datasets/audio_dataset.py +++ b/sonosco/datasets/audio_dataset.py @@ -22,7 +22,7 @@ MAX_SHIFT = 4000 -class DataProcessor: +class AudioDataProcessor: def __init__(self, window_stride, window_size, sample_rate, labels="abc", normalize=False, augment=False): """ @@ -104,7 +104,7 @@ def parse_transcript(self, transcript_path): class AudioDataset(Dataset): - def __init__(self, processor: DataProcessor, manifest_filepath): + def __init__(self, processor: AudioDataProcessor, manifest_filepath): """ Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by a comma. Each new line is a different sample. 
Example below: diff --git a/sonosco/datasets/data_loader.py b/sonosco/datasets/data_loader.py new file mode 100644 index 0000000..b9f1115 --- /dev/null +++ b/sonosco/datasets/data_loader.py @@ -0,0 +1,59 @@ +import numpy as np +import logging +import torch + +from torch.utils.data import Dataset, DataLoader, Sampler +from .audio_dataset import AudioDataProcessor, AudioDataset +from .data_sampler import BucketingSampler + + +LOGGER = logging.getLogger(__name__) + + +class AudioDataLoader(DataLoader): + + def __init__(self, *args, **kwargs): + """Creates a data loader for AudioDatasets.""" + super(AudioDataLoader, self).__init__(*args, **kwargs) + self.collate_fn = self._collate_fn + + # TODO: Optimise + def _collate_fn(self, batch): + batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True) + longest_sample = batch[0][0] + freq_size, max_seqlength = longest_sample.size() + minibatch_size = len(batch) + inputs = torch.zeros(minibatch_size, 1, freq_size, max_seqlength) + input_percentages = torch.FloatTensor(minibatch_size) + target_sizes = np.zeros(minibatch_size, dtype=np.int32) + + # TODO: Numpy broadcasting magic + targets = [] + + for x in range(minibatch_size): + inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) + input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) + target_sizes[x] = len(batch[x][1]) + targets.extend(batch[x][1]) + + return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) + + +def create_data_loaders(train_manifest, val_manifest, batch_size, num_data_workers, **kwargs): + processor = AudioDataProcessor(**kwargs) + + # create train loader + train_dataset = AudioDataset(processor, manifest_filepath=train_manifest) + LOGGER.info(f"Training dataset containing {len(train_dataset)} samples is created") + sampler = BucketingSampler(train_dataset, batch_size=batch_size) + train_loader = AudioDataLoader(dataset=train_dataset, num_workers=num_data_workers, batch_sampler=sampler) + LOGGER.info("Training data loader created.") + + # create validation loader + val_dataset = AudioDataset(processor, manifest_filepath=val_manifest) + LOGGER.info(f"Validation dataset containing {len(val_dataset)} samples is created") + sampler = BucketingSampler(val_dataset, batch_size=batch_size) + val_loader = AudioDataLoader(dataset=val_dataset, num_workers=num_data_workers, batch_sampler=sampler) + LOGGER.info("Validation data loader created.") + + return train_loader, val_loader diff --git a/sonosco/datasets/audio_data_sampler.py b/sonosco/datasets/data_sampler.py similarity index 100% rename from sonosco/datasets/audio_data_sampler.py rename to sonosco/datasets/data_sampler.py diff --git a/sonosco/datasets/download_datasets/__init__.py b/sonosco/datasets/download_datasets/__init__.py index e69de29..2ef1650 100644 --- a/sonosco/datasets/download_datasets/__init__.py +++ b/sonosco/datasets/download_datasets/__init__.py @@ -0,0 +1,3 @@ +def download_all_datasets(path: str): + """Downloads all datasets which are missing.""" + pass diff --git a/sonosco/models/__init__.py b/sonosco/models/__init__.py index e69de29..eaab54d 100644 --- a/sonosco/models/__init__.py +++ b/sonosco/models/__init__.py @@ -0,0 +1 @@ +from .deepspeech2 import DeepSpeech2 \ No newline at end of file diff --git a/sonosco/run_training.py b/sonosco/run_training.py new file mode 100644 index 0000000..62222f7 --- /dev/null +++ b/sonosco/run_training.py @@ -0,0 +1,39 @@ +import logging +import click +import torch.nn.functional as torch_functional + +from 
sonosco.common.constants import SONOSCO
+from sonosco.common.utils import setup_logging
+from sonosco.common.path_utils import parse_yaml
+from sonosco.training import Experiment, ModelTrainer
+from sonosco.datasets import create_data_loaders
+from sonosco.models import DeepSpeech2
+
+LOGGER = logging.getLogger(SONOSCO)
+
+
+@click.command()
+@click.option("-e", "--experiment_name", default="default", type=click.STRING, help="Experiment name.")
+@click.option("-c", "--config_path", default="config/train.yaml", type=click.Path(), help="Path to train configurations.")
+def main(experiment_name, config_path):
+    Experiment.create(experiment_name)
+    config = parse_yaml(config_path)["train"]
+
+    train_loader, val_loader = create_data_loaders(**config)
+
+    # TODO: change to load different models dynamically
+    model = DeepSpeech2()
+
+    trainer = ModelTrainer(model, loss=torch_functional.ctc_loss, epochs=config["max_epochs"],
+                           train_data_loader=train_loader, val_data_loader=val_loader,
+                           lr=config["learning_rate"])
+
+    try:
+        trainer.start_training()
+    except KeyboardInterrupt:
+        trainer.stop_training()
+
+
+if __name__ == '__main__':
+    setup_logging(LOGGER)
+    main()
diff --git a/sonosco/train.py b/sonosco/train.py
deleted file mode 100644
index a763a47..0000000
--- a/sonosco/train.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import argparse
-from typing import Dict
-
-import yaml
-
-from modelwrapper import ModelWrapper
-
-parser = argparse.ArgumentParser(description='ASR training')
-parser.add_argument('--config', metavar='DIR',
-                    help='Path to train config file', default='config/train.yaml')
-
-if __name__ == '__main__':
-    args = parser.parse_args()
-    with open(args.config, 'r') as file:
-        config = yaml.load(file)
-    config_dict: Dict = config["train"]
-    model = ModelWrapper(**config_dict)
-    model.train()
diff --git a/sonosco/training/__init__.py b/sonosco/training/__init__.py
index e69de29..05caba2 100644
--- a/sonosco/training/__init__.py
+++ b/sonosco/training/__init__.py
@@ -0,0 +1,2 @@
+from .experiment import Experiment
+from .trainer import ModelTrainer
diff --git a/sonosco/training/abstract_callback.py b/sonosco/training/abstract_callback.py
new file mode 100644
index 0000000..8d920b6
--- /dev/null
+++ b/sonosco/training/abstract_callback.py
@@ -0,0 +1,25 @@
+from abc import ABC, abstractmethod
+
+
+class AbstractCallback(ABC):
+    """
+    Interface that defines how callbacks must be specified.
+    """
+
+    @abstractmethod
+    def __call__(self, epoch, step, performance_measures, context):
+        """
+        Called after every batch by the ModelTrainer.
+        Parameters:
+            epoch (int): current epoch number
+            step (int): current batch number
+            performance_measures (dict): losses and metrics based on a running average
+            context (ModelTrainer): reference to the calling ModelTrainer, allows to access members
+        """
+        pass
+
+    def close(self):
+        """
+        Handle cleanup work if necessary. Will be called at the end of the last epoch.
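+        For reference, a minimal callback sketch (illustrative, not part of this module):
+        >>> class LossPrinter(AbstractCallback):
+        ...     def __call__(self, epoch, step, performance_measures, context):
+        ...         print(epoch, step, performance_measures.get('loss'))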
+ """ + pass diff --git a/sonosco/training/experiment.py b/sonosco/training/experiment.py index 23f46e0..07fd1cf 100644 --- a/sonosco/training/experiment.py +++ b/sonosco/training/experiment.py @@ -1,11 +1,14 @@ import os import os.path as path -import sys import datetime +import logging import sonosco.common.path_utils as path_utils +import sonosco.common.utils as utils from time import time -from sonosco.common.utils import copy_code + + +LOGGER = logging.getLogger(__name__) class Experiment: @@ -33,23 +36,23 @@ def __init__(self, self.name = self._set_experiment_name(experiment_name) self.path = path.join(self.experiments_path, self.name) # path to current experiment self.logs = path.join(self.experiments_path, "logs") + self.code = path.join(self.experiments_path, "code") self._sub_directories = sub_directories self._exclude_dirs = exclude_dirs - self._exclude_dirs.extend(exclude_dirs) self._exclude_files = exclude_files - self._exclude_files.extend(exclude_files) self._init_directories() self._copy_sourcecode() + self._set_logging() @staticmethod def _set_experiments_dir(experiments_path): if experiments_path is not None: return experiments_path - local_path = os.path.dirname(sys.argv[0]) + local_path = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) local_path = local_path if local_path != '' else './' return path.join(local_path, "experiments") @@ -58,6 +61,9 @@ def _set_experiment_name(experiment_name): date_time = datetime.datetime.fromtimestamp(time()).strftime('%Y-%m-%d_%H:%M:%S') return f"{date_time}_{experiment_name}" + def _set_logging(self): + utils.add_log_file(self.logs, LOGGER) + def _init_directories(self): """ Create all basic directories. """ path_utils.try_create_directory(self.experiments_path) @@ -71,11 +77,11 @@ def _add_member(self, key, value): def _copy_sourcecode(self): """ Copy code from execution directory in experiment code directory. """ - sources_path = os.path.dirname(sys.argv[0]) + sources_path = os.path.dirname(os.path.dirname(__file__)) sources_path = sources_path if sources_path != '' else './' - copy_code(sources_path, self.code, - exclude_dirs=self._exclude_dirs, - exclude_files=self._exclude_files) + utils.copy_code(sources_path, self.code, + exclude_dirs=self._exclude_dirs, + exclude_files=self._exclude_files) def add_directory(self, dir_name): """ @@ -96,3 +102,7 @@ def add_file(folder_path, filename, content): """ Adds a file with provided content to folder. Convenience function. """ with open(path.join(folder_path, filename), 'w') as text_file: text_file.write(content) + + @staticmethod + def create(name: str): + return Experiment(name) diff --git a/sonosco/training/helpers.py b/sonosco/training/helpers.py new file mode 100644 index 0000000..893c14d --- /dev/null +++ b/sonosco/training/helpers.py @@ -0,0 +1,143 @@ +import logging +import sys +import os.path as path +import numpy as np +import torch + +from collections import defaultdict +from .abstract_callback import AbstractCallback + + +LOGGER = logging.getLogger(__name__) + + +class HistoryRecorder(AbstractCallback): + """ Records all losses and metrics during training. 
""" + + def __init__(self, epoch_steps): + self.history = defaultdict(list) + self._epoch_steps = epoch_steps + + def __call__(self, epoch, step, performance_measures, context): + + if step % self._epoch_steps == 0: # only record at end of epoch + return + + for key, value in performance_measures.items(): + if type(value) == torch.Tensor: + value = value.item() + self.history[key].append(value) + + +class ModelCheckpoint(AbstractCallback): + """ + Saves the model and optimizer state at the point with lowest validation error throughout training. + Args: + output_path (string): path to directory where the checkpoint will be saved to + model_name (string): name of the checkpoint file + """ + + def __init__(self, output_path, model_name='model_checkpoint.pt'): + self.output_path = path.join(output_path, model_name) + self.best_val_score = sys.float_info.max + + def __call__(self, epoch, step, performance_measures, context): + + if 'val_loss' not in performance_measures: + return + + if performance_measures['val_loss'] < self.best_val_score: + self.best_val_score = performance_measures['val_loss'] + self._save_checkpoint(context.model, context.optimizer, epoch) + + def _save_checkpoint(self, model, optimizer, epoch): + LOGGER.info("Saving model at checkpoint.") + model.eval() + model_state_dict = model.state_dict() + optimizer_state_dict = optimizer.state_dict() + torch.save({'arch': model.__class__.__name__, + 'epoch': epoch, + 'model_state_dict': model_state_dict, + 'optimizer_state_dict': optimizer_state_dict + }, self.output_path) + model.train() + + +class LayerwiseGradientNorm(AbstractCallback): + """ Collects the layer-wise gradient norms for each epoch. """ + + def __init__(self): + self.layer_grads = dict() + self._batch_layer_grads = dict() + + def __call__(self, epoch, step, performance_measures, context): + """ + Store gradient norms for each batch and compute means after the + epoch's last batch. + """ + self._store_batch_layer_grads(context.model) + + if step == (len(context.train_data_loader) - 1): # end of epoch + self._store_layer_grads() + self._batch_layer_grads = dict() + + def _store_batch_layer_grads(self, model): + """ Store gradient norm of each layer for current batch. """ + for name, param in model.named_parameters(): + + if not param.requires_grad or param.grad is None: + continue + + if not name in self._batch_layer_grads: + self._batch_layer_grads[name] = [] + + grad_norm = torch.sqrt(torch.sum(param.grad**2)).item() + self._batch_layer_grads[name].append(grad_norm) + + def _store_layer_grads(self): + """ Compute mean of all batch steps in epoch. """ + for name, grads in self._batch_layer_grads.items(): + + if name not in self.layer_grads: + self.layer_grads[name] = [] + + layer_epoch_grad = np.mean(grads) + self.layer_grads[name].append(layer_epoch_grad) + + +class EarlyStopping(AbstractCallback): + """ + Early Stopping to terminate training early if the monitored metric did not improve + over a number of epochs. 
+ Args: + monitor (string): name of the relevant loss or metric (usually 'val_loss') + min_delta (float): minimum change in monitored metric to qualify as an improvement + patience (int): number of epochs to wait for an improvement before terminating the training + """ + + def __init__(self, monitor='val_loss', min_delta=0, patience=5): + self.monitor = monitor + self.min_delta = min_delta + self.patience = patience + self.last_best = sys.float_info.max + self.counter = 0 + self.stopped_epoch = 0 + + def __call__(self, epoch, step, performance_measures, context): + + if step != len(context.train_data_loader) - 1: # only continue at end of epoch + return + + if self.monitor not in performance_measures: + return + + current_loss = performance_measures[self.monitor] + if (self.last_best - current_loss) >= self.min_delta: + self.last_best = current_loss + self.counter = 0 + else: + self.counter += 1 + + if self.counter >= self.patience: + context._stop_training = True # make ModelTrainer stop + LOGGER.info(f"Early stopping after epoch {epoch}") diff --git a/sonosco/training/learning_rates.py b/sonosco/training/learning_rates.py new file mode 100644 index 0000000..e977514 --- /dev/null +++ b/sonosco/training/learning_rates.py @@ -0,0 +1,131 @@ +import logging +import sys + +from .abstract_callback import AbstractCallback + + +LOGGER = logging.getLogger(__name__) + + +class StepwiseLearningRateReduction(AbstractCallback): + """ + Reduces the learning rate of the optimizer every N epochs. + Args: + epoch_steps (int): number of epochs after which learning rate is reduced + reduction_factor (float): multiplicative factor for learning rate reduction + min_lr (float): lower bound for learning rate + """ + + def __init__(self, epoch_steps, reduction_factor, min_lr=None): + self._epoch_steps = epoch_steps + self._reduction_factor = reduction_factor + self._min_lr = min_lr + + def __call__(self, epoch, step, performance_measures, context): + # execute at the beginning of every Nth epoch + if epoch > 0 and step == 0 and epoch % self._epoch_steps == 0: + + # reduce lr for each param group (necessary for e.g. Adam) + for param_group in context.optimizer.param_groups: + new_lr = param_group['lr'] * self._reduction_factor + + if self._min_lr is not None and new_lr < self._min_lr: + continue + + param_group['lr'] = new_lr + LOGGER.info("Epoch {}: Reducing learning rate to {}".format(epoch, new_lr)) + + +class ScheduledLearningRateReduction(AbstractCallback): + """ + Reduces the learning rate of the optimizer for every scheduled epoch. + Args: + epoch_schedule (list of int): defines at which epoch the learning rate will be reduced + reduction_factor (float): multiplicative factor for learning rate reduction + min_lr (float): lower bound for learning rate + """ + + def __init__(self, epoch_schedule, reduction_factor, min_lr=None): + self._epoch_schedule = sorted(epoch_schedule) + self._reduction_factor = reduction_factor + self._min_lr = min_lr + + def __call__(self, epoch, step, performance_measures, context): + + if not self._epoch_schedule: # stop if schedule is empty + return + + next_epoch_step = self._epoch_schedule[0] + if epoch >= next_epoch_step and step == 0: + + # reduce lr for each param group (necessary for e.g. 
Adam) + for param_group in context.optimizer.param_groups: + new_lr = param_group['lr'] * self._reduction_factor + + if self._min_lr is not None and new_lr < self._min_lr: + continue + + param_group['lr'] = new_lr + LOGGER.info("Epoch {}: Reducing learning rate to {}".format(epoch, new_lr)) + + self._epoch_schedule.pop(0) + + +class ReduceLROnPlateau(AbstractCallback): + """ + Reduce the learning rate if the train or validation loss plateaus. + Args: + monitor (string): name of the relevant loss or metric (usually 'val_loss') + factor (float): factor by which the lr is decreased at each step + patience (int): number of epochs to wait on plateau for loss improvement before reducing lr + min_delta (float): minimum improvement necessary to reset patience + cooldown (int): number of epochs to cooldown after a lr reduction + min_lr (float): minimum value the learning rate can decrease to + verbose (bool): print to console + """ + + def __init__(self, monitor='val_loss', factor=0.1, patience=10, min_delta=0, cooldown=0, min_lr=0, verbose=False): + self.monitor = monitor + if factor >= 1.0 or factor < 0: + raise ValueError('ReduceLROnPlateau does only support a factor in [0,1[.') + self.factor = factor + self.min_lr = min_lr + self.min_delta = min_delta + self.patience = patience + self.verbose = verbose + self.cooldown = cooldown + self.cooldown_counter = 0 + self.wait = 0 + self.best_loss = sys.float_info.max + + def __call__(self, epoch, step, performance_measures, context): + + if self.monitor not in performance_measures: + return + + if step != len(context.train_data_loader)-1: # only continue at end of epoch + return + + if self.cooldown_counter > 0: # in cooldown phase + self.cooldown_counter -= 1 + self.wait = 0 + + current_loss = performance_measures[self.monitor] + if (self.best_loss - current_loss) >= self.min_delta: # loss improved, save and reset wait counter + self.best_loss = current_loss + self.wait = 0 + + elif self.cooldown_counter <= 0: # no improvement and not in cooldown + + if self.wait >= self.patience: # waited long enough, reduce lr + for param_group in context.optimizer.param_groups: + old_lr = param_group['lr'] + new_lr = old_lr * self.factor + if new_lr >= self.min_lr: # only decrease if there is still enough buffer space + if self.verbose: + LOGGER.info("Epoch {}: Reducing learning rate from {} to {}".format(epoch, old_lr, new_lr)) #TODO print per param group? + param_group['lr'] = new_lr + self.cooldown_counter = self.cooldown # new cooldown phase after lr reduction + self.wait = 0 + else: + self.wait += 1 diff --git a/sonosco/training/trainer.py b/sonosco/training/trainer.py new file mode 100644 index 0000000..ba438fa --- /dev/null +++ b/sonosco/training/trainer.py @@ -0,0 +1,252 @@ +import logging +import torch +import torch.optim.optimizer +import torch.nn.utils.clip_grad as grads + +from collections import defaultdict +from typing import Callable, Union, Tuple, List, Any +from torch.utils.data import DataLoader +from .abstract_callback import AbstractCallback + + +LOGGER = logging.getLogger(__name__) + + +class ModelTrainer: + """ + This class handles the training of a pytorch model. It provides convenience + functionality to add metrics and callbacks and is inspired by the keras API. + Args: + model (nn.Module): model to be trained + optimizer (optim.Optimizer): optimizer used for training, e.g. 
torch.optim.Adam
+        loss (function): loss function that either accepts (model_output, label) or (input, label, model) if custom_model_eval is true
+        epochs (int): epochs to train
+        train_data_loader (utils.data.DataLoader): training data
+        val_data_loader (utils.data.DataLoader, optional): validation data
+        custom_model_eval (boolean, optional): enables training mode where the model is evaluated in the loss function
+        gpu (int, optional): if not set, training runs on cpu, otherwise an int is expected that determines the training gpu
+        clip_grads (float, optional): if set, training gradients will be clipped at the specified norm
+    Example:
+        >>> model_trainer = ModelTrainer(model, F.nll_loss, num_epochs, train_loader, gpu=0)
+        >>> model_trainer.start_training()
+    """
+
+    def __init__(self,
+                 model: torch.nn.Module,
+                 loss: Union[Callable[[torch.Tensor, torch.Tensor], float],
+                             Callable[[torch.Tensor, torch.Tensor, torch.nn.Module], float]],
+                 epochs: int,
+                 train_data_loader: DataLoader,
+                 val_data_loader: DataLoader = None,
+                 optimizer=torch.optim.Adam,
+                 lr: float = 1e-4,
+                 custom_model_eval: bool = False,
+                 gpu: int = None,
+                 clip_grads: float = None,
+                 metrics: List[Callable[[torch.Tensor, Any], Union[float, torch.Tensor]]] = None,
+                 callbacks: List[AbstractCallback] = None):
+
+        self.model = model
+        self.train_data_loader = train_data_loader
+        self.val_data_loader = val_data_loader
+        self.optimizer = optimizer(self.model.parameters(), lr=lr)
+        self.loss = loss
+        self._epochs = epochs
+        self._metrics = metrics if metrics is not None else list()
+        self._callbacks = callbacks if callbacks is not None else list()
+        self._gpu = gpu
+        self._custom_model_eval = custom_model_eval
+        self._clip_grads = clip_grads
+        self._stop_training = False # used to stop training externally
+
+    def set_metrics(self, metrics):
+        """
+        Set metric functions that receive y_pred and y_true. Metrics are expected to return
+        a basic numeric type like float or int.
+        """
+        self._metrics = metrics
+
+    def add_metric(self, metric):
+        self._metrics.append(metric)
+
+    def set_callbacks(self, callbacks):
+        """
+        Set callbacks that are callable functionals and receive epoch, step, loss, context.
+        Context is a pointer to the ModelTrainer instance. Callbacks are called after each
+        processed batch.
+        """
+        self._callbacks = callbacks
+
+    def add_callback(self, callback):
+        self._callbacks.append(callback)
+
+    def start_training(self):
+        self.model.train() # train mode
+        for epoch in range(1, self._epochs + 1):
+            self._epoch_step(epoch)
+
+            if self._stop_training:
+                break
+
+        self._close_callbacks()
+
+    def _epoch_step(self, epoch):
+        """ Execute one training epoch.
""" + running_batch_loss = 0 + running_metrics = defaultdict(float) + + for step, batch in enumerate(self.train_data_loader): + batch = self._recursive_to_cuda(batch) # move to GPU + + # compute training batch + loss, model_output, grad_norm = self._train_on_batch(batch) + running_batch_loss += loss.item() + + # compute metrics + self._compute_running_metrics(model_output, batch, running_metrics) + running_metrics['gradient_norm'] += grad_norm # add grad norm to metrics + + # evaluate validation set at end of epoch + if self.val_data_loader and step == (len(self.train_data_loader) - 1): + self._compute_validation_error(running_metrics) + + # print current loss and metrics and provide it to callbacks + performance_measures = self._construct_performance_dict(step, running_batch_loss, running_metrics) + self._print_step_info(epoch, step, performance_measures) + self._apply_callbacks(epoch, step, performance_measures) + + def stop_training(self): + self._stop_training = True + + def _comp_gradients(self): + """ Compute the gradient norm for all model parameters. """ + grad_sum = 0 + for param in self.model.parameters(): + if param.requires_grad and param.grad is not None: + grad_sum += torch.sum(param.grad ** 2) + grad_norm = torch.sqrt(grad_sum).item() + return grad_norm + + def _train_on_batch(self, batch): + """ Compute loss depending on settings, compute gradients and apply optimization step. """ + # evaluate loss + batch_x, batch_y = batch + if self._custom_model_eval: + loss, model_output = self.loss(batch, self.model) + else: + model_output = self.model(batch_x) + loss = self.loss(model_output, batch_y) + + self.optimizer.zero_grad() # reset gradients + loss.backward() # backpropagation + + # gradient clipping + if self._clip_grads is not None: + grads.clip_grad_norm(self.model.parameters(), self._clip_grads) + + grad_norm = self._comp_gradients() # compute average gradient norm + + self.optimizer.step() # apply optimization step + return loss, model_output, grad_norm + + def _compute_validation_error(self, running_metrics): + """ Evaluate the model's validation error. """ + running_val_loss = 0 + + self.model.eval() + for batch in self.val_data_loader: + batch = self._recursive_to_cuda(batch) + + # evaluate loss + batch_x, batch_y = batch + if self._custom_model_eval: # e.g. used for sequences and other complex model evaluations + val_loss, model_output = self.loss(batch, self.model) + else: + model_output = self.model(batch_x) + val_loss = self.loss(model_output, batch_y) + + # compute running validation loss and metrics. add 'val_' prefix to all measures. + running_val_loss += val_loss.item() + self._compute_running_metrics(model_output, batch, running_metrics, prefix='val_') + self.model.train() + + # add loss to metrics and normalize all validation measures + running_metrics['val_loss'] = running_val_loss + for key, value in running_metrics.items(): + if 'val_' not in key: + continue + running_metrics[key] = value / len(self.val_data_loader) + + def _compute_running_metrics(self, + y_pred: torch.Tensor, + batch: Tuple[torch.Tensor, torch.Tensor], + running_metrics: dict, + prefix: str = ''): + """ + Computes all metrics based on predictions and batches and adds them to the metrics + dictionary. Allows to prepend a prefix to the metric names in the dictionary. 
+ """ + for metric in self._metrics: + if self._custom_model_eval: + metric_result = metric(y_pred, batch) + else: + batch_y = batch[1] + metric_result = metric(y_pred, batch_y) + + # convert to float if metric returned tensor + if type(metric_result) == torch.Tensor: + metric_result = metric_result.item() + + running_metrics[prefix + metric.__name__] += metric_result + + def _construct_performance_dict(self, train_step, running_batch_loss, running_metrics): + """ + Constructs a combined dictionary of losses and metrics for callbacks based on + the current running averages. + """ + performance_dict = defaultdict() + for key, value in running_metrics.items(): + if 'val_' not in key: + performance_dict[key] = value / (train_step + 1.) + else: + performance_dict[key] = value # validation metrics, already normalized + + performance_dict['loss'] = running_batch_loss / (train_step + 1.) + return performance_dict + + def _apply_callbacks(self, epoch, step, performance_measures): + """ Call all registered callbacks with current batch information. """ + for callback in self._callbacks: + callback(epoch, step, performance_measures, self) + + def _close_callbacks(self): + """ Signal callbacks training is finished. """ + for callback in self._callbacks: + callback.close() + + def _print_step_info(self, epoch, step, performance_measures): + """ Print running averages for loss and metrics during training. """ + output_message = "epoch {} batch {}/{}".format(epoch, step, len(self.train_data_loader) - 1) + delim = " " + for metric_name in sorted(list(performance_measures.keys())): + if metric_name == 'gradient_norm': + continue + output_message += delim + "{}: {:.6f}".format(metric_name, performance_measures[metric_name]) + LOGGER.info(output_message) + + def _recursive_to_cuda(self, tensors): + """ + Recursively iterates nested lists in depth-first order and transfers all tensors + to specified cuda device. 
+ Parameters: + tensors (list or Tensor): list of tensors or tensor tuples, can be nested + """ + if self._gpu is None: # keep on cpu + return tensors + + if type(tensors) != list: # not only for torch.Tensor + return tensors.to(device=self._gpu) + + for i in range(len(tensors)): + tensors[i] = self._recursive_to_cuda(tensors[i]) + return tensors diff --git a/tests/test_dataset.py b/tests/test_dataset.py index a80c5c9..f895354 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -6,9 +6,9 @@ from sonosco.common.constants import SONOSCO from sonosco.common.utils import setup_logging -from sonosco.datasets.audio_dataset import AudioDataset, DataProcessor -from sonosco.datasets.audio_data_sampler import BucketingSampler -from sonosco.datasets.audio_data_loader import DataLoader +from sonosco.datasets.audio_dataset import AudioDataset, AudioDataProcessor +from sonosco.datasets.data_sampler import BucketingSampler +from sonosco.datasets.data_loader import DataLoader from sonosco.datasets.download_datasets.librispeech import try_download_librispeech @@ -44,7 +44,7 @@ def test_librispeech_clean(logger): # create data processor audio_conf = dict(sample_rate=SAMPLE_RATE, window_size=.02, window_stride=.01, labels='ABCDEFGHIJKLMNOPQRSTUVWXYZ', normalize=True, augment=True) - processor = DataProcessor(**audio_conf) + processor = AudioDataProcessor(**audio_conf) # get manifest file manifest_directory = os.path.join(os.path.expanduser("~"), LIBRI_SPEECH_DIR) From 7d6a3d598916962a5f5680ac00ea57af8178cecc Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Wed, 19 Jun 2019 17:49:11 +0200 Subject: [PATCH 46/58] add travis ci --- .travis.yaml | 11 +++++++++++ .../install_dependencies.sh | 0 2 files changed, 11 insertions(+) create mode 100644 .travis.yaml rename install_dependencies.sh => scripts/install_dependencies.sh (100%) diff --git a/.travis.yaml b/.travis.yaml new file mode 100644 index 0000000..c1c8b03 --- /dev/null +++ b/.travis.yaml @@ -0,0 +1,11 @@ +language: python + +python: + - "3.6" + +install: + - bash scripts/install_dependencies.sh + - pip install -e . 
+
+script:
+  - pytest
diff --git a/install_dependencies.sh b/scripts/install_dependencies.sh
similarity index 100%
rename from install_dependencies.sh
rename to scripts/install_dependencies.sh

From f18f303085b1d3cff2e3c75ce6902666a8106417 Mon Sep 17 00:00:00 2001
From: yuriyarabskyy
Date: Wed, 19 Jun 2019 19:02:43 +0200
Subject: [PATCH 47/58] update run_trainer

---
 sonosco/config/train.yaml             |  4 +--
 sonosco/config/train_librispeech.yaml | 49 +++++++++++++++++++++++++++
 sonosco/datasets/audio_dataset.py     | 12 ++-----
 sonosco/datasets/data_loader.py       | 14 ++++----
 sonosco/run_training.py               |  5 +--
 sonosco/training/trainer.py           |  1 +
 6 files changed, 65 insertions(+), 20 deletions(-)
 create mode 100644 sonosco/config/train_librispeech.yaml

diff --git a/sonosco/config/train.yaml b/sonosco/config/train.yaml
index 3e9cba9..3e8e364 100644
--- a/sonosco/config/train.yaml
+++ b/sonosco/config/train.yaml
@@ -3,7 +3,7 @@ train:
   val_manifest: 'examples/manifests/val_manifest.csv'
   labels_path: 'examples/labels.json' # Contains all characters for transcription
   log_dir: 'logs' # Location for log files
-  def_dir: 'examples/checkpoints/', # Default location to save/load models
+  def_dir: 'examples/checkpoints/' # Default location to save/load models

   load_from: 'asr_final.pth' # File name containing a checkpoint to continue/finetune

@@ -19,7 +19,7 @@ train:
   labels: 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' # labels used by the model

   max_epochs: 70 # Number of training epochs
-  learning_rate: 3e-4 # Initial learning rate
+  learning_rate: 3.0e-4 # Initial learning rate
   momentum: 0.9 # Momentum
   max_norm: 800 # Norm cutoff to prevent explosion of gradients
   learning_anneal: 1.1 # Annealing applied to learning rate every epoch

diff --git a/sonosco/config/train_librispeech.yaml b/sonosco/config/train_librispeech.yaml
new file mode 100644
index 0000000..eba936b
--- /dev/null
+++ b/sonosco/config/train_librispeech.yaml
@@ -0,0 +1,49 @@
+train:
+  train_manifest: '/Users/yuriy/temp/data/libri_speech/libri_test_clean_manifest.csv'
+  val_manifest: '/Users/yuriy/temp/data/libri_speech/libri_test_clean_manifest.csv'
+  log_dir: 'logs' # Location for log files
+  def_dir: 'examples/checkpoints/' # Default location to save/load models
+
+  load_from: 'asr_final.pth' # File name containing a checkpoint to continue/finetune
+
+  sample_rate: 16000 # Sample rate
+  window_size: 0.02 # Window size for spectrogram in seconds
+  window_stride: 0.01 # Window stride for spectrogram in seconds
+  window: 'hamming' # Window type for spectrogram generation
+
+  batch_size: 32 # Batch size for training
+  hidden_size: 800 # Hidden size of RNNs
+  hidden_layers: 5 # Number of RNN layers
+  rnn_type: 'gru' # Type of the RNN unit: gru|lstm are supported
+  labels: 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' # labels used by the model
+
+  max_epochs: 70 # Number of training epochs
+  learning_rate: 3.0e-4 # Initial learning rate
+  momentum: 0.9 # Momentum
+  max_norm: 800 # Norm cutoff to prevent explosion of gradients
+  learning_anneal: 1.1 # Annealing applied to learning rate every epoch
+  sortaGrad: True # Turn on ordering of dataset on sequence length for the first epoch
+
+  checkpoint: True # Enables checkpoint saving of model
+  checkpoint_per_epoch: 1 # Save checkpoint per x epochs
+  silent: False # Turn on progress tracking per iteration
+  verbose: False # Turn on verbose progress tracking
+  continue: False # Continue training with a pre-trained model
+  finetune: False # Finetune a pre-trained model
+
+  num_data_workers: 8 # Number of workers used in data-loading
+  augment: False # Use random tempo and gain perturbations
+  shuffle: True # Turn on shuffling and sample from dataset based on sequence length (smallest to largest)
+
+  seed: 123456 # Seed to generators
+  cuda: True # Use cuda to train model
+  half_precision: Trues # Uses half precision to train a model
+  apex: True # Uses mixed precision to train a model
+  static_loss_scaling: False # Static loss scale for mixed precision
+  dynamic_loss_scaling: True # Use dynamic loss scaling for mixed precision
+
+  dist_url: 'tcp://127.0.0.1:1550' # URL used to set up distributed training
+  dist_backend: 'nccl' # Distributed backend
+  world_size: 1 # Number of distributed processes
+  rank: 0 # The rank of the current process
+  gpu_rank: 0 # If using distributed parallel for multi_gpu, sets the GPU for the process
\ No newline at end of file
diff --git a/sonosco/datasets/audio_dataset.py b/sonosco/datasets/audio_dataset.py
index c82e697..74683f0 100644
--- a/sonosco/datasets/audio_dataset.py
+++ b/sonosco/datasets/audio_dataset.py
@@ -24,7 +24,7 @@
 class AudioDataProcessor:

-    def __init__(self, window_stride, window_size, sample_rate, labels="abc", normalize=False, augment=False):
+    def __init__(self, window_stride, window_size, sample_rate, labels="abc", normalize=False, augment=False, **kwargs):
         """
         Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by
         a comma. Each new line is a different sample. Example below:
         /path/to/audio.wav,/path/to/audio.txt
         ...
@@ -76,13 +76,8 @@ def parse_audio(self, audio_path, raw=False):
         if raw:
             return sound

-        sound_tensor = torch.from_numpy(sound)
-
-        if global_settings.CUDA_ENABLED:
-            sound_tensor = sound_tensor.cuda()
-
         # TODO: comment why take the last element?
-        complex_spectrogram = librosa.stft(sound_tensor,
+        complex_spectrogram = librosa.stft(sound,
                                            n_fft=self.window_size_samples,
                                            hop_length=self.window_stride_samples,
                                            win_length=self.window_size_samples)
         spectrogram, phase = librosa.magphase(complex_spectrogram)
         # S = log(S+1)
         spectrogram = torch.from_numpy(np.log1p(spectrogram))

@@ -95,10 +90,9 @@ def parse_transcript(self, transcript_path):
     def parse_transcript(self, transcript_path):
         with open(transcript_path, 'r', encoding='utf8') as transcript_file:
             transcript = transcript_file.read().replace('\n', '')
-            LOGGER.info(f"1: {transcript}")
         # TODO: Is it fast enough?
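+        # maps each character to its index in labels_map; filter(None, ...)
+        # drops unmapped characters (note: it would also drop a label at index 0)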
transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)])) - LOGGER.info(f"transcript_path: {transcript_path} transcript: {transcript}") + LOGGER.debug(f"transcript_path: {transcript_path} transcript: {transcript}") return transcript diff --git a/sonosco/datasets/data_loader.py b/sonosco/datasets/data_loader.py index b9f1115..74912d6 100644 --- a/sonosco/datasets/data_loader.py +++ b/sonosco/datasets/data_loader.py @@ -39,21 +39,21 @@ def _collate_fn(self, batch): return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) -def create_data_loaders(train_manifest, val_manifest, batch_size, num_data_workers, **kwargs): +def create_data_loaders(**kwargs): processor = AudioDataProcessor(**kwargs) # create train loader - train_dataset = AudioDataset(processor, manifest_filepath=train_manifest) + train_dataset = AudioDataset(processor, manifest_filepath=kwargs["train_manifest"]) LOGGER.info(f"Training dataset containing {len(train_dataset)} samples is created") - sampler = BucketingSampler(train_dataset, batch_size=batch_size) - train_loader = AudioDataLoader(dataset=train_dataset, num_workers=num_data_workers, batch_sampler=sampler) + sampler = BucketingSampler(train_dataset, batch_size=kwargs["batch_size"]) + train_loader = AudioDataLoader(dataset=train_dataset, num_workers=kwargs["num_data_workers"], batch_sampler=sampler) LOGGER.info("Training data loader created.") # create validation loader - val_dataset = AudioDataset(processor, manifest_filepath=val_manifest) + val_dataset = AudioDataset(processor, manifest_filepath=kwargs["val_manifest"]) LOGGER.info(f"Validation dataset containing {len(val_dataset)} samples is created") - sampler = BucketingSampler(val_dataset, batch_size=batch_size) - val_loader = AudioDataLoader(dataset=val_dataset, num_workers=num_data_workers, batch_sampler=sampler) + sampler = BucketingSampler(val_dataset, batch_size=kwargs["batch_size"]) + val_loader = AudioDataLoader(dataset=val_dataset, num_workers=kwargs["num_data_workers"], batch_sampler=sampler) LOGGER.info("Validation data loader created.") return train_loader, val_loader diff --git a/sonosco/run_training.py b/sonosco/run_training.py index 62222f7..b5485e3 100644 --- a/sonosco/run_training.py +++ b/sonosco/run_training.py @@ -14,10 +14,11 @@ @click.command() @click.option("-e", "--experiment_name", default="default", type=click.STRING, help="Experiment name.") -@click.option("-c", "--config_path", default="config/train.yaml", type=click.Path, help="Path to train configurations.") +@click.option("-c", "--config_path", default="config/train.yaml", type=click.STRING, + help="Path to train configurations.") def main(experiment_name, config_path): Experiment.create(experiment_name) - config = parse_yaml(config_path) + config = parse_yaml(config_path)["train"] train_loader, val_loader = create_data_loaders(**config) diff --git a/sonosco/training/trainer.py b/sonosco/training/trainer.py index ba438fa..15841b7 100644 --- a/sonosco/training/trainer.py +++ b/sonosco/training/trainer.py @@ -96,6 +96,7 @@ def _epoch_step(self, epoch): running_metrics = defaultdict(float) for step, batch in enumerate(self.train_data_loader): + import pdb; pdb.set_trace() batch = self._recursive_to_cuda(batch) # move to GPU # compute training batch From 9fc9eef20f6c051fb430031f7c1b37f946c3ab7e Mon Sep 17 00:00:00 2001 From: "w.jurasz" Date: Fri, 21 Jun 2019 15:03:00 +0200 Subject: [PATCH 48/58] Added serialize decorator --- .gitignore | 2 -- sonosco/common/class_utils.py | 12 
+++---- sonosco/model.py | 14 ++++++-- sonosco/serialization.py | 67 +++++++++++++++++++++++++++++++++++ 4 files changed, 85 insertions(+), 10 deletions(-) create mode 100644 sonosco/serialization.py diff --git a/.gitignore b/.gitignore index 453b1ff..385f777 100644 --- a/.gitignore +++ b/.gitignore @@ -2,8 +2,6 @@ sonosco/pycandle/ sonosco/pycandle sonosco/experiments/ -sonosco/datasets/download_datasets/ -!sonosco/datasets/download_datasets/*.py **/.DS_Store diff --git a/sonosco/common/class_utils.py b/sonosco/common/class_utils.py index b73cafa..e4af560 100644 --- a/sonosco/common/class_utils.py +++ b/sonosco/common/class_utils.py @@ -1,23 +1,23 @@ import inspect -from typing import List +from typing import Set -def get_constructor_args(cls: type) -> List[str]: +def get_constructor_args(cls) -> Set[str]: """ E.g. class Bar(): def __init__(self, arg1, arg2): - get_constructor_args(BAR) + get_constructor_args(Bar) # returns ['arg1', 'arg2'] Args: - cls (type): + cls (object): - Returns: list containing names of constructor arguments + Returns: set containing names of constructor arguments """ - return inspect.getfullargspec(cls.__init__).args[1:] + return set(inspect.getfullargspec(cls.__init__).args[1:]) def get_class_by_name(name: str) -> type: diff --git a/sonosco/model.py b/sonosco/model.py index 5bbbdea..6d12aec 100644 --- a/sonosco/model.py +++ b/sonosco/model.py @@ -80,11 +80,20 @@ def __init__(self, arg1, arg2): Args: - model (nn.Module): model to infre from + model (nn.Module): model to infer from Returns (dict): Mapping from __init__ argument to it's value """ + constructor_args = get_constructor_args(model) + model_attributes = model.__dict__ + attributes_names = set(model_attributes.keys()) + + ambiguous_arguments = constructor_args - attributes_names + + if ambiguous_arguments: + LOGGER.warning(f"Some constructor arguments do not have equivalent fields ") + return {} @@ -145,11 +154,12 @@ def load_model(self, cls: type, path: str, deserialize_method_name: str = 'deser package = torch.load(path, map_location=lambda storage, loc: storage) if hasattr(cls, deserialize_method_name) and callable(getattr(cls, deserialize_method_name)): return getattr(cls, deserialize_method_name)(package) - constructor_args = set(get_constructor_args(cls)) + constructor_args = get_constructor_args(cls) stored_keys = set(package.keys()) stored_keys.remove('state_dict') args_to_apply = constructor_args & stored_keys + # If the lengths are not equal it means that there is some inconsistency between save and load if len(args_to_apply) != len(constructor_args): not_in_constructor = stored_keys - constructor_args if not_in_constructor: diff --git a/sonosco/serialization.py b/sonosco/serialization.py new file mode 100644 index 0000000..24b136b --- /dev/null +++ b/sonosco/serialization.py @@ -0,0 +1,67 @@ +from dataclasses import _process_class, _create_fn, _set_new_attribute, fields, is_dataclass +__primitives = {int, float, str, bool} +__iterables = [list, set, tuple] + +def serializable(_cls=None): + """ + + Returns the same class as was passed in, with init and serialize methods. + + + Args: + _cls: + + Returns: + + """ + + def wrap(cls): + cls = _process_class(cls, init=True, repr=False, eq=False, order=False, unsafe_hash=False, frozen=False) + _set_new_attribute(cls, '__serialize__', __add_serialize(cls)) + return cls + + # See if we're being called as @dataclass or @dataclass(). + if _cls is None: + # We're called with parens. + return wrap + + # We're called as @dataclass without parens. 
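+    # Intended usage (illustrative sketch, names are examples only):
+    #
+    #     @serializable
+    #     class Checkpoint:
+    #         epoch: int = 0
+    #
+    # Checkpoint then gets a dataclass-style __init__ plus a __serialize__
+    # method that returns its fields as a dict.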
+    return wrap(_cls)
+
+
+def __add_serialize(cls):
+    fields_to_serialize = fields(cls)
+    sonosco_self = ['__sonosco_self__' if 'self' in fields_to_serialize else 'self']
+    serialize_body = __create_serialize_body(fields_to_serialize)
+    return _create_fn('__serialize__', [sonosco_self], [f'return {serialize_body}'], return_type=dict)
+
+
+def __create_serialize_body(fields_to_serialize):
+    body_lines = ["{"]
+    for field in fields_to_serialize:
+        if __is_primitive(field) or __is_iterable_of_primitives(field):
+            body_lines.append(__create_dict_entry(field.name, f"self.{field.name}"))
+        elif is_dataclass(field.type):
+            body_lines.append(__create_dict_entry(field.name, f"self.{field.name}.__serlialize__()"))
+        else:
+            __throw_unsupported_data_type()
+    body_lines.append("}")
+    return body_lines
+
+
+def __is_iterable_of_primitives(field):
+    return field.__origin__ in __iterables and field.__args__[0] in __primitives
+
+
+def __throw_unsupported_data_type():
+    raise TypeError("Unsupported data type. Only primitives, lists of primitives, "
+                    "@serializable and @dataclass objects can be seralized")
+
+
+def __create_dict_entry(key, value):
+    return f'\'{key}\': {value},'
+
+
+def __is_primitive(obj):
+    return obj.type in __primitives
+

From f488e7369058bc0a6c13c6652d812cdb8e03c7d6 Mon Sep 17 00:00:00 2001
From: "w.jurasz"
Date: Sun, 23 Jun 2019 00:20:44 +0200
Subject: [PATCH 49/58] Improved serialize to support torch.nn.Module

---
 sonosco/model.py                      |  52 ++----------
 sonosco/models/deepspeech2_sonosco.py | 118 ++++++++++++++++++++++++++
 sonosco/serialization.py              |  23 ++++-
 3 files changed, 144 insertions(+), 49 deletions(-)
 create mode 100644 sonosco/models/deepspeech2_sonosco.py

diff --git a/sonosco/model.py b/sonosco/model.py
index 6d12aec..c943244 100644
--- a/sonosco/model.py
+++ b/sonosco/model.py
@@ -1,10 +1,12 @@
 import logging
+
 import torch
 import deprecation
 import inspect
 import torch.nn as nn

 from common.class_utils import get_constructor_args, get_class_by_name
+from serialization import is_serializable

 LOGGER = logging.getLogger(__name__)

@@ -28,8 +30,7 @@ def save_model_simple(self, model: nn.Module, path: str) -> None:
         """
         torch.save(model, path)

-    def save_model(self, model: nn.Module, path: str, infer_structure: bool = False,
-                   serialize_method_name: str = 'serialize') -> None:
+    def save_model(self, model: nn.Module, path: str) -> None:
         """
         Saves the model using pickle protocol.
@@ -51,50 +52,11 @@ def save_model(self, model: nn.Module, path: str, infer_structure: bool = False,

         Returns:

         """
-        if infer_structure:
-            entity_to_save = self.get_constructor_args_with_values(model)
-            entity_to_save['state_dict'] = model.state_dict()
-        elif hasattr(model, serialize_method_name) and callable(getattr(model, serialize_method_name)):
-            entity_to_save = getattr(model, serialize_method_name)()
+        if is_serializable(model):
+            entity_to_save = model.__serialize__()
+            torch.save(entity_to_save, path)
         else:
-            entity_to_save = {'state_dict': model.state_dict()}
-
-        torch.save(entity_to_save, path)
-
-    @staticmethod
-    def get_constructor_args_with_values(model: nn.Module):
-        """
-        Assigns values to __init__ params names
-
-        For example:
-
-        class Bar():
-            def __init__(self, arg1, arg2):
-                self.arg1 = arg1
-                self.some_other_name = args2
-
-
-        bar = Bar("A","B")
-        get_constructor_args_with_values(bar)
-        # returns {arg1: arg1_val, arg2: arg2_val}
-
-
-        Args:
-            model (nn.Module): model to infer from
-
-        Returns (dict): Mapping from __init__ argument to it's value
-
-        """
-        constructor_args = get_constructor_args(model)
-        model_attributes = model.__dict__
-        attributes_names = set(model_attributes.keys())
-
-        ambiguous_arguments = constructor_args - attributes_names
-
-        if ambiguous_arguments:
-            LOGGER.warning(f"Some constructor arguments do not have equivalent fields ")
-
-        return {}
+            raise TypeError("Only @serializable class can be serialized")


 class Loader:

diff --git a/sonosco/models/deepspeech2_sonosco.py b/sonosco/models/deepspeech2_sonosco.py
new file mode 100644
index 0000000..1e1b1c3
--- /dev/null
+++ b/sonosco/models/deepspeech2_sonosco.py
@@ -0,0 +1,118 @@
+import math
+from collections import OrderedDict
+from dataclasses import field
+
+import torch
+from torch import nn
+
+from models.deepspeech2 import MaskConv, BatchRNN, SequenceWise, InferenceBatchSoftmax, supported_rnns, \
+    supported_rnns_inv
+from serialization import serializable
+
+
+@serializable
+class DeepSpeech2(nn.Module):
+    rnn_type: nn.RNNBase = nn.LSTM
+    labels: str = "abc"
+    rnn_hid_size: int = 768
+    nb_layers: int = 5
+    audio_conf: dict = field(default_factory=dict)
+    bidirectional: bool = True
+    version: str = '0.0.1'
+
+    def __post_init__(self):
+        sample_rate = self.audio_conf.get("sample_rate", 16000)
+        window_size = self.audio_conf.get("window_size", 0.02)
+        num_classes = len(self.labels)
+        self.conv = MaskConv(nn.Sequential(
+            nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5)),
+            nn.BatchNorm2d(32),
+            nn.Hardtanh(0, 20, inplace=True),
+            nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5)),
+            nn.BatchNorm2d(32),
+            nn.Hardtanh(0, 20, inplace=True)
+        ))
+        # Based on above convolutions and spectrogram size using conv formula (W - F + 2P)/ S+1
+        rnn_in_size = int(math.floor((sample_rate * window_size) / 2) + 1)
+        rnn_in_size = int(math.floor(rnn_in_size + 2 * 20 - 41) / 2 + 1)
+        rnn_in_size = int(math.floor(rnn_in_size + 2 * 10 - 21) / 2 + 1)
+        rnn_in_size *= 32
+
+        rnns = [('0', BatchRNN(input_size=rnn_in_size, hidden_size=self.rnn_hid_size, rnn_type=self.rnn_type, batch_norm=False))]
+        rnns.extend([(f"{x + 1}", BatchRNN(input_size=self.rnn_hid_size, hidden_size=self.rnn_hid_size, rnn_type=self.rnn_type))
+                     for x in range(self.nb_layers - 1)])
+        self.rnns = nn.Sequential(OrderedDict(rnns))
+
+        fully_connected = nn.Sequential(
+            nn.BatchNorm1d(self.rnn_hid_size),
+            nn.Linear(self.rnn_hid_size, num_classes, bias=False)
+        )
+
+        self.fc = nn.Sequential(
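+            # SequenceWise collapses the TxNxH input to (T*N)xH, applies the wrapped
+            # modules (BatchNorm1d and Linear here) and restores the original shape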
+            SequenceWise(fully_connected),
+        )
+
+        self.inference_softmax = InferenceBatchSoftmax()
+
+    def forward(self, x, lengths):
+        # if x.is_cuda and self.mixed_precision:
+        #     x = x.half()
+        lengths = lengths.cpu().int()
+        output_lengths = self.get_seq_lens(lengths)
+        x, _ = self.conv(x, output_lengths)
+
+        sizes = x.size()
+        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # Collapse feature dimension
+        x = x.transpose(1, 2).transpose(0, 1).contiguous()  # TxNxH
+
+        for rnn in self.rnns:
+            x = rnn(x, output_lengths)
+
+        if not self.bidirectional:  # no need for lookahead layer in bidirectional
+            x = self.lookahead(x)
+
+        x = self.fc(x)
+        x = x.transpose(0, 1)
+        # identity in training mode, softmax in eval mode
+        x = self.inference_softmax(x)
+        return x, output_lengths
+
+    def get_seq_lens(self, input_length):
+        """
+        Given a 1D Tensor or Variable containing integer sequence lengths, return a 1D tensor or variable
+        containing the size sequences that will be output by the network.
+        :param input_length: 1D Tensor
+        :return: 1D Tensor scaled by model
+        """
+        seq_len = input_length
+        for m in self.conv.modules():
+            if type(m) == nn.modules.conv.Conv2d:
+                seq_len = ((seq_len + 2 * m.padding[1] - m.dilation[1] * (m.kernel_size[1] - 1) - 1) / m.stride[1] + 1)
+        return seq_len.int()
+
+    @staticmethod
+    def get_param_size(model):
+        params = 0
+        for p in model.parameters():
+            tmp = 1
+            for x in p.size():
+                tmp *= x
+            params += tmp
+        return params
+
+    def __repr__(self):
+        rep = f"DeepSpeech2 version: {self.version}\n" + \
+              "=======================================\n" + \
+              "Recurrent Neural Network Properties\n" + \
+              f"  RNN Type:  \t{self.rnn_type.__name__.lower()}\n" + \
+              f"  RNN Layers:\t{self.nb_layers}\n" + \
+              f"  RNN Size:  \t{self.rnn_hid_size}\n" + \
+              f"  Classes:   \t{len(self.labels)}\n" + \
+              "---------------------------------------\n" + \
+              "Model Features\n" + \
+              f"  Labels:       \t{self.labels}\n" + \
+              f"  Sample Rate:  \t{self.audio_conf.get('sample_rate', 'n/a')}\n" + \
+              f"  Window Type:  \t{self.audio_conf.get('window', 'n/a')}\n" + \
+              f"  Window Size:  \t{self.audio_conf.get('window_size', 'n/a')}\n" + \
+              f"  Window Stride:\t{self.audio_conf.get('window_stride', 'n/a')}"
+        return rep

diff --git a/sonosco/serialization.py b/sonosco/serialization.py
index 24b136b..cae248e 100644
--- a/sonosco/serialization.py
+++ b/sonosco/serialization.py
@@ -29,33 +29,46 @@ def wrap(cls):
     return wrap(_cls)

+def is_serializable(obj):
+    return hasattr(obj, '__serialize__')

 def __add_serialize(cls):
     fields_to_serialize = fields(cls)
     sonosco_self = ['__sonosco_self__' if 'self' in fields_to_serialize else 'self']
-    serialize_body = __create_serialize_body(fields_to_serialize)
+    serialize_body = __create_serialize_body(cls, fields_to_serialize)
     return _create_fn('__serialize__', [sonosco_self], [f'return {serialize_body}'], return_type=dict)


-def __create_serialize_body(fields_to_serialize):
+def __create_serialize_body(cls, fields_to_serialize):
     body_lines = ["{"]
     for field in fields_to_serialize:
         if __is_primitive(field) or __is_iterable_of_primitives(field):
             body_lines.append(__create_dict_entry(field.name, f"self.{field.name}"))
         elif is_dataclass(field.type):
             body_lines.append(__create_dict_entry(field.name, f"self.{field.name}.__serlialize__()"))
+        elif __is_nn_class(field.type):
+            body_lines.append("'{}': {{".format(field.name))
+            __extract_from_nn(cls, body_lines)
+            body_lines.append("}")
         else:
             __throw_unsupported_data_type()
+    body_lines.append(__create_dict_entry("state_dict", "self.state_dict()"))
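+    # a state_dict entry is always appended, so the serialized dict carries
+    # the module weights alongside the constructor arguments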
body_lines.append("}") return body_lines +def __extract_from_nn(cls, body_lines): + constants = list(filter(lambda el: not el.startswith('_'), cls.__constants__)) + for constant in constants: + body_lines.append(__create_dict_entry(constant, f"self.{constant}")) + def __is_iterable_of_primitives(field): return field.__origin__ in __iterables and field.__args__[0] in __primitives def __throw_unsupported_data_type(): - raise TypeError("Unsupported data type. Only primitives, lists of primitives, " - "@serializable and @dataclass objects can be seralized") + raise TypeError("Unsupported data type. Only primitives, lists of primitives, torch.nn.Module" + "@serializable and @dataclass objects can be serialized") def __create_dict_entry(key, value): @@ -65,3 +78,5 @@ def __create_dict_entry(key, value): def __is_primitive(obj): return obj.type in __primitives +def __is_nn_class(cls): + return hasattr(cls, '__constants__') \ No newline at end of file From fc1431f8eaf0ed0925848116be30714c7f6468b2 Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Sun, 23 Jun 2019 15:02:15 +0200 Subject: [PATCH 50/58] fix deepspeech2 and first successful training run --- sonosco/common/path_utils.py | 2 +- sonosco/config/train_librispeech.yaml | 2 +- sonosco/datasets/data_loader.py | 6 +++--- sonosco/models/deepspeech2.py | 12 +++++++++--- sonosco/run_training.py | 12 +++++++++--- sonosco/training/trainer.py | 13 +++++-------- 6 files changed, 28 insertions(+), 19 deletions(-) diff --git a/sonosco/common/path_utils.py b/sonosco/common/path_utils.py index 03321bb..2199b8f 100644 --- a/sonosco/common/path_utils.py +++ b/sonosco/common/path_utils.py @@ -16,4 +16,4 @@ def try_download(destination: str, url: str): def parse_yaml(file_path: str): with codecs.open(file_path, "r", "utf-8") as file: - return yaml.load(file) + return yaml.load(file, Loader=yaml.FullLoader) diff --git a/sonosco/config/train_librispeech.yaml b/sonosco/config/train_librispeech.yaml index eba936b..64d6294 100644 --- a/sonosco/config/train_librispeech.yaml +++ b/sonosco/config/train_librispeech.yaml @@ -37,7 +37,7 @@ train: seed: 123456 # Seed to generators cuda: True # Use cuda to train model - half_precision: Trues # Uses half precision to train a model + half_precision: True # Uses half precision to train a model apex: True # Uses mixed precision to train a model static_loss_scaling: False # Static loss scale for mixed precision dynamic_loss_scaling: True # Use dynamic loss scaling for mixed precision diff --git a/sonosco/datasets/data_loader.py b/sonosco/datasets/data_loader.py index 74912d6..79ed208 100644 --- a/sonosco/datasets/data_loader.py +++ b/sonosco/datasets/data_loader.py @@ -24,7 +24,7 @@ def _collate_fn(self, batch): freq_size, max_seqlength = longest_sample.size() minibatch_size = len(batch) inputs = torch.zeros(minibatch_size, 1, freq_size, max_seqlength) - input_percentages = torch.FloatTensor(minibatch_size) + input_lengths = torch.IntTensor(minibatch_size) target_sizes = np.zeros(minibatch_size, dtype=np.int32) # TODO: Numpy broadcasting magic @@ -32,11 +32,11 @@ def _collate_fn(self, batch): for x in range(minibatch_size): inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) - input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) + input_lengths[x] = batch[x][0].size(1) target_sizes[x] = len(batch[x][1]) targets.extend(batch[x][1]) - return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) + return inputs, torch.IntTensor(targets), input_lengths, 
torch.from_numpy(target_sizes) def create_data_loaders(**kwargs): diff --git a/sonosco/models/deepspeech2.py b/sonosco/models/deepspeech2.py index dcfa100..35645d2 100644 --- a/sonosco/models/deepspeech2.py +++ b/sonosco/models/deepspeech2.py @@ -7,10 +7,13 @@ from collections import OrderedDict import torch +import logging import torch.nn as nn import torch.nn.functional as F +LOGGER = logging.getLogger(__name__) + supported_rnns = { 'lstm': nn.LSTM, 'rnn': nn.RNN, @@ -82,13 +85,14 @@ def forward(self, input_): class BatchRNN(nn.Module): - def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, batch_norm=True): + def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, batch_norm=True, bidirectional=False): super(BatchRNN, self).__init__() + self.bidirectional = bidirectional self.input_size = input_size self.hidden_size = hidden_size self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size, - bidirectional=True, bias=True) + bidirectional=bidirectional, bias=True) def flatten_parameters(self): self.rnn.flatten_parameters() @@ -134,7 +138,8 @@ def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hid_size=768, nb_layers=5 nn.Hardtanh(0, 20, inplace=True) )) # Based on above convolutions and spectrogram size using conv formula (W - F + 2P)/ S+1 - rnn_in_size = int(math.floor((sample_rate * window_size) / 2) + 1) + rnn_in_size = int(math.floor((sample_rate * window_size) / 4) + 1) + LOGGER.debug(f"Initial calculated feature size: {rnn_in_size}") rnn_in_size = int(math.floor(rnn_in_size + 2 * 20 - 41) / 2 + 1) rnn_in_size = int(math.floor(rnn_in_size + 2 * 10 - 21) / 2 + 1) rnn_in_size *= 32 @@ -158,6 +163,7 @@ def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hid_size=768, nb_layers=5 def forward(self, x, lengths): # if x.is_cuda and self.mixed_precision: # x = x.half() + LOGGER.debug(f"Actual initial size: {x.size()}") lengths = lengths.cpu().int() output_lengths = self.get_seq_lens(lengths) x, _ = self.conv(x, output_lengths) diff --git a/sonosco/run_training.py b/sonosco/run_training.py index b5485e3..76e0a4e 100644 --- a/sonosco/run_training.py +++ b/sonosco/run_training.py @@ -22,12 +22,18 @@ def main(experiment_name, config_path): train_loader, val_loader = create_data_loaders(**config) + def custom_loss(batch, model): + batch_x, batch_y, input_lengths, target_lengths = batch + model_output, output_lengths = model(batch_x, input_lengths) + loss = torch_functional.ctc_loss(model_output.transpose(0, 1), batch_y, output_lengths, target_lengths) + return loss, model_output + # TODO: change to load different models dynamically - model = DeepSpeech2() + model = DeepSpeech2(labels=config["labels"]) - trainer = ModelTrainer(model, loss=torch_functional.ctc_loss, epochs=config["max_epochs"], + trainer = ModelTrainer(model, loss=custom_loss, epochs=config["max_epochs"], train_data_loader=train_loader, val_data_loader=val_loader, - lr=config["learning_rate"]) + lr=config["learning_rate"], custom_model_eval=True) try: trainer.start_training() diff --git a/sonosco/training/trainer.py b/sonosco/training/trainer.py index 15841b7..7c9bd83 100644 --- a/sonosco/training/trainer.py +++ b/sonosco/training/trainer.py @@ -26,14 +26,11 @@ class ModelTrainer: custom_model_eval (boolean, optional): enables training mode where the model is evaluated in the loss function gpu (int, optional): if not set training runs on cpu, otherwise an int is expected that determines the training gpu clip_grads 
(float, optional): if set training gradients will be clipped at specified norm - Example: - >>> model_trainer = ModelTrainer(model, optimizer, F.nll_loss, num_epochs, train_loader, gpu=0) - >>> model_trainer.start_training() """ def __init__(self, model: torch.nn.Module, - loss: Union[Callable[[torch.Tensor, torch.Tensor], float], + loss: Union[Callable[[Any, Any], Any], Callable[[torch.Tensor, torch.Tensor, torch.nn.Module], float]], epochs: int, train_data_loader: DataLoader, @@ -95,8 +92,8 @@ def _epoch_step(self, epoch): running_batch_loss = 0 running_metrics = defaultdict(float) - for step, batch in enumerate(self.train_data_loader): - import pdb; pdb.set_trace() + for step, (batch_x, batch_y, input_lengths, target_lengths) in enumerate(self.train_data_loader): + batch = (batch_x, batch_y, input_lengths, target_lengths) batch = self._recursive_to_cuda(batch) # move to GPU # compute training batch @@ -131,11 +128,11 @@ def _comp_gradients(self): def _train_on_batch(self, batch): """ Compute loss depending on settings, compute gradients and apply optimization step. """ # evaluate loss - batch_x, batch_y = batch + batch_x, batch_y, input_lengths, target_lengths = batch if self._custom_model_eval: loss, model_output = self.loss(batch, self.model) else: - model_output = self.model(batch_x) + model_output = self.model(batch_x, input_lengths) loss = self.loss(model_output, batch_y) self.optimizer.zero_grad() # reset gradients From 57caebc0280db70ef7ca4ff44ffd6b4aba9014ed Mon Sep 17 00:00:00 2001 From: ga38nif Date: Sun, 23 Jun 2019 15:27:30 +0200 Subject: [PATCH 51/58] rework audio dataloader and make it nicer --- sonosco/datasets/audio_data_loader.py | 40 ++++++++++++++------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/sonosco/datasets/audio_data_loader.py b/sonosco/datasets/audio_data_loader.py index 967b89c..0c44177 100644 --- a/sonosco/datasets/audio_data_loader.py +++ b/sonosco/datasets/audio_data_loader.py @@ -7,27 +7,29 @@ class AudioDataLoader(DataLoader): def __init__(self, *args, **kwargs): - """Creates a data loader for AudioDatasets.""" + ''' + Creates a data loader for AudioDatasets. + ''' super(AudioDataLoader, self).__init__(*args, **kwargs) self.collate_fn = self._collate_fn - # TODO: Optimise def _collate_fn(self, batch): + #sort the batch in decreasing order of sequence length batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True) - longest_sample = batch[0][0] - freq_size, max_seqlength = longest_sample.size() - minibatch_size = len(batch) - inputs = torch.zeros(minibatch_size, 1, freq_size, max_seqlength) - input_percentages = torch.FloatTensor(minibatch_size) - target_sizes = np.zeros(minibatch_size, dtype=np.int32) - - # TODO: Numpy broadcasting magic - targets = [] - - for x in range(minibatch_size): - inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) - input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) - target_sizes[x] = len(batch[x][1]) - targets.extend(batch[x][1]) - - return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) \ No newline at end of file + + #pad the tensors to have equal lengths, therefore transpose the tensors in + #the batch. 
The tensors have shape: freq_size x sequence_length
+        #and need to be of shape: sequence_length x freq_length, as sequence length differs
+        #but not the freq_length
+        inputs = torch.nn.utils.rnn.pad_sequence(list(map(lambda x: x[0].transpose(0,1), batch)), batch_first=True)
+
+        #inputs need to be transposed back from shape batch_size x sequence_length x freq_length
+        #to batch_size x freq_length x sequence_length. Additionally, unsqueeze tensor
+        inputs = inputs.transpose(1,2).unsqueeze(1)
+        input_lengths = torch.IntTensor(list(map(lambda x: x[0].size(1), batch))) #create tensor of input lengths
+
+        targets_arr = list(zip(*batch))[1] #extract targets array from batch ( batch is array of tuples)
+        target_lengths = torch.IntTensor(list(map(lambda x: len(x),targets_arr))) #create tensor of target lengths
+        targets = torch.cat(list(map(lambda x: torch.IntTensor(x), targets_arr))) #create tensor of targets
+
+        return inputs, targets, input_lengths, target_lengths

From e3e21d88d8ace89f47c8312ede5e3e621c23d43b Mon Sep 17 00:00:00 2001
From: Yuriy Arabskyy
Date: Mon, 24 Jun 2019 14:28:19 +0200
Subject: [PATCH 52/58] Fix typo

---
 sonosco/serialization.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/sonosco/serialization.py b/sonosco/serialization.py
index cae248e..9f905b6 100644
--- a/sonosco/serialization.py
+++ b/sonosco/serialization.py
@@ -2,6 +2,7 @@
 __primitives = {int, float, str, bool}
 __iterables = [list, set, tuple]

+
 def serializable(_cls=None):
     """

@@ -32,6 +33,7 @@
 def is_serializable(obj):
     return hasattr(obj, '__serialize__')

+
 def __add_serialize(cls):
     fields_to_serialize = fields(cls)
     sonosco_self = ['__sonosco_self__' if 'self' in fields_to_serialize else 'self']
@@ -45,7 +47,7 @@ def __create_serialize_body(cls, fields_to_serialize):
         if __is_primitive(field) or __is_iterable_of_primitives(field):
             body_lines.append(__create_dict_entry(field.name, f"self.{field.name}"))
         elif is_dataclass(field.type):
-            body_lines.append(__create_dict_entry(field.name, f"self.{field.name}.__serlialize__()"))
+            body_lines.append(__create_dict_entry(field.name, f"self.{field.name}.__serialize__()"))
         elif __is_nn_class(field.type):
             body_lines.append("'{}': {{".format(field.name))
@@ -56,6 +58,7 @@ def __create_serialize_body(cls, fields_to_serialize):
     body_lines.append("}")
     return body_lines

+
 def __extract_from_nn(cls, body_lines):
     constants = list(filter(lambda el: not el.startswith('_'), cls.__constants__))
     for constant in constants:
@@ -78,5 +81,6 @@ def __create_dict_entry(key, value):

 def __is_primitive(obj):
     return obj.type in __primitives

+
 def __is_nn_class(cls):
-    return hasattr(cls, '__constants__')
\ No newline at end of file
+    return hasattr(cls, '__constants__')

From 341e32abb89f53f63b6faa1ce6499c90b5fee96f Mon Sep 17 00:00:00 2001
From: yuriyarabskyy
Date: Mon, 24 Jun 2019 22:47:51 +0200
Subject: [PATCH 53/58] resolve comments regarding noise makers

---
 .gitignore                     |  2 --
 setup.py                       |  4 ++--
 sonosco/common/audio_tools.py  | 19 ++-----------------
 sonosco/common/noise_makers.py | 18 ++++++++++++++++++
 4 files changed, 22 insertions(+), 21 deletions(-)
 create mode 100644 sonosco/common/noise_makers.py

diff --git a/.gitignore b/.gitignore
index dcee099..58fc0df 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,8 +2,6 @@
 sonosco/pycandle/
 experiments/
 tests/test_wavs/
-sonosco/pycandle
-sonosco/experiments/

 **/.DS_Store

diff --git a/setup.py b/setup.py
index 207c04e..4a1ac9b 100644
--- a/setup.py
+++ b/setup.py
@@ -2,8
+2,8 @@ setup( name="sonosco", - description="Framework for training automatic speech recognition systems.", - author="The Roboy Gang", + description="Framework for deep automatic speech recognition systems.", + author="Roboy", packages=["sonosco"], include_package_data=True, dependency_links=[] diff --git a/sonosco/common/audio_tools.py b/sonosco/common/audio_tools.py index 9ab77ab..807e79b 100644 --- a/sonosco/common/audio_tools.py +++ b/sonosco/common/audio_tools.py @@ -2,6 +2,8 @@ import numpy as np import librosa +from .noise_makers import NoiseMaker, GaussianNoiseMaker + def get_duration(file_path): return float(subprocess.check_output([f'soxi -D "{file_path.strip()}"'], shell=True)) @@ -19,23 +21,6 @@ def transcode_recordings_ted3(source, destination, start_time, end_time, sample_ subprocess.call([f"sox {source} -r {sample_rate} -b 16 -c 1 {destination} trim {start_time} ={end_time}"],shell=True) -class NoiseMaker: - - def __call__(self, audio): - """Adds noise to the audio signal.""" - pass - - -class GaussianNoiseMaker(NoiseMaker): - - def __init__(self, std=0.002): - self.std = std - - def __call__(self, audio): - noise = np.random.randn(len(audio)) - return audio + self.std * noise - - def add_noise(audio, noise_maker: NoiseMaker = None): if noise_maker is None: noise_maker = GaussianNoiseMaker() diff --git a/sonosco/common/noise_makers.py b/sonosco/common/noise_makers.py new file mode 100644 index 0000000..551e2ec --- /dev/null +++ b/sonosco/common/noise_makers.py @@ -0,0 +1,18 @@ +import numpy as np + + +class NoiseMaker: + + def __call__(self, audio): + """Adds noise to the audio signal.""" + pass + + +class GaussianNoiseMaker(NoiseMaker): + + def __init__(self, std=0.002): + self.std = std + + def __call__(self, audio): + noise = np.random.randn(len(audio)) + return audio + self.std * noise From 37d9a1c6b3bc4e2bd84553bd931f100458f99b38 Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Mon, 24 Jun 2019 23:43:48 +0200 Subject: [PATCH 54/58] resolve comments --- .gitignore | 1 - requirements.txt | 1 + sonosco/datasets/__init__.py | 6 +- sonosco/datasets/audio_dataset.py | 137 ------------------ sonosco/datasets/dataset.py | 53 +++++++ sonosco/datasets/download_datasets/an4.py | 12 +- .../download_datasets/common_voice.py | 5 +- .../download_datasets/merge_manifests.py | 7 +- .../datasets/{data_loader.py => loader.py} | 4 +- .../datasets/{data_sampler.py => samplers.py} | 2 +- tests/test_dataset.py | 6 +- 11 files changed, 76 insertions(+), 158 deletions(-) delete mode 100644 sonosco/datasets/audio_dataset.py create mode 100644 sonosco/datasets/dataset.py rename sonosco/datasets/{data_loader.py => loader.py} (96%) rename sonosco/datasets/{data_sampler.py => samplers.py} (97%) diff --git a/.gitignore b/.gitignore index 58fc0df..9717b09 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ # Created by .ignore support plugin (hsz.mobi) -sonosco/pycandle/ experiments/ tests/test_wavs/ diff --git a/requirements.txt b/requirements.txt index ca4f802..8ae1526 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,3 +23,4 @@ wget==3.2 pytest==4.6.3 click==7.0 deprecation==2.0.6 +dataclasses==0.6 diff --git a/sonosco/datasets/__init__.py b/sonosco/datasets/__init__.py index 9cfa7a3..b74d5b6 100644 --- a/sonosco/datasets/__init__.py +++ b/sonosco/datasets/__init__.py @@ -1,3 +1,3 @@ -from .audio_dataset import AudioDataProcessor, AudioDataset -from .data_sampler import BucketingSampler -from .data_loader import AudioDataLoader, create_data_loaders +from .dataset import 
AudioDataProcessor, AudioDataset +from .samplers import BucketingSampler +from .loader import AudioDataLoader, create_data_loaders diff --git a/sonosco/datasets/audio_dataset.py b/sonosco/datasets/audio_dataset.py deleted file mode 100644 index 74683f0..0000000 --- a/sonosco/datasets/audio_dataset.py +++ /dev/null @@ -1,137 +0,0 @@ -# ---------------------------------------------------------------------------- -# Based on SeanNaren's deepspeech.pytorch: -# https://github.com/SeanNaren/deepspeech.pytorch -# ---------------------------------------------------------------------------- - -import logging -import torch -import librosa -import numpy as np -import sonosco.config.global_settings as global_settings -import sonosco.common.audio_tools as audio_tools -import sonosco.common.utils as utils - -from torch.utils.data import Dataset - - -LOGGER = logging.getLogger(__name__) -MIN_STRETCH = 0.7 -MAX_STRETCH = 1.3 -MIN_PITCH = 0.7 -MAX_PITCH = 1.5 -MAX_SHIFT = 4000 - - -class AudioDataProcessor: - - def __init__(self, window_stride, window_size, sample_rate, labels="abc", normalize=False, augment=False, **kwargs): - """ - Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by - a comma. Each new line is a different sample. Example below: - /path/to/audio.wav,/path/to/audio.txt - ... - :param window_stride: number of seconds to skip between each window - :param window_size: number of seconds to use for a window of spectrogram - :param sample_rate: sample rate of the recordings - :param labels: string containing all the possible characters to map to - :param normalize: apply standard mean and deviation normalization to audio tensor - :param augment(default False): apply random tempo and gain perturbations - """ - self.window_stride = window_stride - self.window_size = window_size - self.sample_rate = sample_rate - self.labels_map = dict([(labels[i], i) for i in range(len(labels))]) - self.normalize = normalize - self.augment = augment - - @property - def window_stride_samples(self): - return int(self.sample_rate * self.window_stride) - - @property - def window_size_samples(self): - return int(self.sample_rate * self.window_stride) - - def retrieve_file(self, audio_path): - sound, sample_rate = librosa.load(audio_path, sr=self.sample_rate) - return sound, sample_rate - - def augment_audio(self, sound, stretch=True, shift=False, pitch=True, noise=True): - augmented = audio_tools.stretch(sound, utils.random_float(MIN_STRETCH, MAX_STRETCH)) if stretch else sound - augmented = audio_tools.shift(augmented, np.random.randint(MAX_SHIFT)) if shift else augmented - augmented = audio_tools.pitch_shift(augmented, self.sample_rate, - n_steps=utils.random_float(MIN_PITCH, MAX_PITCH)) if pitch else augmented - augmented = audio_tools.add_noise(augmented) if noise else augmented - return augmented - - def parse_audio(self, audio_path, raw=False): - sound, sample_rate = self.retrieve_file(audio_path) - - if sample_rate != self.sample_rate: - raise ValueError(f"The stated sample rate {self.sample_rate} and the factual rate {sample_rate} differ!") - - if self.augment: - sound = self.augment_audio(sound) - - if raw: - return sound - - # TODO: comment why take the last element? 
- complex_spectrogram = librosa.stft(sound, - n_fft=self.window_size_samples, - hop_length=self.window_stride_samples, - win_length=self.window_size_samples) - spectrogram, phase = librosa.magphase(complex_spectrogram) - # S = log(S+1) - spectrogram = torch.from_numpy(np.log1p(spectrogram)) - - return spectrogram - - def parse_transcript(self, transcript_path): - with open(transcript_path, 'r', encoding='utf8') as transcript_file: - transcript = transcript_file.read().replace('\n', '') - # TODO: Is it fast enough? - transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)])) - LOGGER.debug(f"transcript_path: {transcript_path} transcript: {transcript}") - return transcript - - -class AudioDataset(Dataset): - - def __init__(self, processor: AudioDataProcessor, manifest_filepath): - """ - Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by - a comma. Each new line is a different sample. Example below: - /path/to/audio.wav,/path/to/audio.txt - ... - :param processor: Data processor object - :param manifest_filepath: Path to manifest csv as describe above - """ - super().__init__() - with open(manifest_filepath) as f: - ids = f.readlines() - ids = [x.strip().split(',') for x in ids] - self.ids = ids - self.size = len(ids) - self.processor = processor - - def get_raw(self, index): - sample = self.ids[index] - audio_path, transcript_path = sample[0], sample[1] - - sound = self.processor.parse_audio(audio_path, raw=True) - transcript = self.processor.parse_transcript(transcript_path) - - return sound, transcript - - def __getitem__(self, index): - sample = self.ids[index] - audio_path, transcript_path = sample[0], sample[1] - - spectrogram = self.processor.parse_audio(audio_path) - transcript = self.processor.parse_transcript(transcript_path) - - return spectrogram, transcript - - def __len__(self): - return self.size diff --git a/sonosco/datasets/dataset.py b/sonosco/datasets/dataset.py new file mode 100644 index 0000000..8345f30 --- /dev/null +++ b/sonosco/datasets/dataset.py @@ -0,0 +1,53 @@ +# ---------------------------------------------------------------------------- +# Based on SeanNaren's deepspeech.pytorch: +# https://github.com/SeanNaren/deepspeech.pytorch +# ---------------------------------------------------------------------------- + +import logging + +from torch.utils.data import Dataset +from .processor import AudioDataProcessor + + +LOGGER = logging.getLogger(__name__) + + +class AudioDataset(Dataset): + + def __init__(self, processor: AudioDataProcessor, manifest_filepath): + """ + Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by + a comma. Each new line is a different sample. Example below: + /path/to/audio.wav,/path/to/audio.txt + ... 
+ :param processor: Data processor object + :param manifest_filepath: Path to manifest csv as describe above + """ + super().__init__() + with open(manifest_filepath) as f: + ids = f.readlines() + ids = [x.strip().split(',') for x in ids] + self.ids = ids + self.size = len(ids) + self.processor = processor + + def get_raw(self, index): + sample = self.ids[index] + audio_path, transcript_path = sample[0], sample[1] + + sound = self.processor.parse_audio(audio_path, raw=True) + transcript = self.processor.parse_transcript(transcript_path) + + return sound, transcript + + def __getitem__(self, index): + sample = self.ids[index] + audio_path, transcript_path = sample[0], sample[1] + + spectrogram = self.processor.parse_audio(audio_path) + transcript = self.processor.parse_transcript(transcript_path) + + return spectrogram, transcript + + def __len__(self): + return self.size diff --git a/sonosco/datasets/download_datasets/an4.py b/sonosco/datasets/download_datasets/an4.py index 1d4368b..374cb5b 100644 --- a/sonosco/datasets/download_datasets/an4.py +++ b/sonosco/datasets/download_datasets/an4.py @@ -15,6 +15,7 @@ AN4_URL = 'http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz' + def try_download_an4(target_dir, sample_rate, min_duration, max_duration): path_to_data = os.path.join(os.path.expanduser("~"), target_dir) if not os.path.exists(path_to_data): @@ -44,6 +45,7 @@ def try_download_an4(target_dir, sample_rate, min_duration, max_duration): create_manifest(path_to_data, os.path.join(path_to_data,'an4_train_manifest.csv'), min_duration, max_duration) create_manifest(path_to_data, os.path.join(path_to_data,'an4_val_manifest.csv'), min_duration, max_duration) + def create_wav_and_transcripts(path, data_tag, sample_rate, extracted_dir, wav_subfolder_name): tag_path = os.path.join(path,data_tag) transcript_path_new = os.path.join(tag_path, 'txt') @@ -59,6 +61,7 @@ def create_wav_and_transcripts(path, data_tag, sample_rate, extracted_dir, wav_s convert_audio_to_wav(path, sample_rate) format_files(file_ids, transcript_path_new, wav_path_new, transcripts_ext, wav_path_ext) + def convert_audio_to_wav(train_path, sample_rate): with os.popen('find %s -type f -name "*.raw"' % train_path) as pipe: for line in pipe: @@ -88,6 +91,7 @@ def _process_transcript(transcripts, x): extracted_transcript = transcripts[x].split('(')[0].strip("").split('<')[0].strip().upper() return extracted_transcript + @click.command() @click.option("--target-dir", default="temp/data/an4", type=str, help="Directory to store the dataset.") @click.option("--sample-rate", default=16000, type=int, help="Sample rate.") @@ -95,14 +99,12 @@ def _process_transcript(transcripts, x): help="Prunes training samples shorter than the min duration (given in seconds).") @click.option("--max-duration", default=15, type=int, help="Prunes training samples longer than the max duration (given in seconds).") - def main(**kwargs): """Processes and downloads an4 dataset.""" - global LOGGER - logger = logging.getLogger(SONOSCO) - setup_logging(logger) try_download_an4(**kwargs) if __name__ == '__main__': - main() \ No newline at end of file + LOGGER = logging.getLogger(SONOSCO) + setup_logging(LOGGER) + main() diff --git a/sonosco/datasets/download_datasets/common_voice.py b/sonosco/datasets/download_datasets/common_voice.py index 4dfc078..41bc102 100644 --- a/sonosco/datasets/download_datasets/common_voice.py +++ b/sonosco/datasets/download_datasets/common_voice.py @@ -56,7 +56,6 @@ def try_download_common_voice(target_dir, sample_rate, 
files_to_use, min_duratio max_duration) - def convert_to_wav(csv_file, target_dir, sample_rate): """ Read *.csv file description, convert mp3 to wav, process text. Save results to target_dir. @@ -76,7 +75,9 @@ def process(x): text = text.strip().upper() with open(os.path.join(txt_dir, file_name + '.txt'), 'w') as f: f.write(text) - audio_tools(source = os.path.join(path_to_data, file_path), destination=os.path.join(wav_dir, file_name + '.wav'), sample_rate = sample_rate) + audio_tools.transcode_recording(source=os.path.join(path_to_data, file_path), + destination=os.path.join(wav_dir, file_name + '.wav'), + sample_rate=sample_rate) LOGGER.info('Converting mp3 to wav for {}.'.format(csv_file)) with open(csv_file) as csvfile: diff --git a/sonosco/datasets/download_datasets/merge_manifests.py b/sonosco/datasets/download_datasets/merge_manifests.py index e5e0fab..6218d52 100644 --- a/sonosco/datasets/download_datasets/merge_manifests.py +++ b/sonosco/datasets/download_datasets/merge_manifests.py @@ -1,11 +1,10 @@ -from __future__ import print_function - import argparse import io import os from tqdm import tqdm -from utils import order_and_prune_files +from .data_utils import order_and_prune_files + parser = argparse.ArgumentParser(description='Merges all manifest CSV files in specified folder.') parser.add_argument('--merge-dir', default='manifests/', help='Path to all manifest files you want to merge') @@ -28,4 +27,4 @@ for wav_path in tqdm(file_paths, total=len(file_paths)): transcript_path = wav_path.replace('/wav/', '/txt/').replace('.wav', '.txt') sample = os.path.abspath(wav_path) + ',' + os.path.abspath(transcript_path) + '\n' - file.write(sample.encode('utf-8')) \ No newline at end of file + file.write(sample.encode('utf-8')) diff --git a/sonosco/datasets/data_loader.py b/sonosco/datasets/loader.py similarity index 96% rename from sonosco/datasets/data_loader.py rename to sonosco/datasets/loader.py index f6f7642..92c3951 100644 --- a/sonosco/datasets/data_loader.py +++ b/sonosco/datasets/loader.py @@ -3,8 +3,8 @@ import torch from torch.utils.data import Dataset, DataLoader, Sampler -from .audio_dataset import AudioDataProcessor, AudioDataset -from .data_sampler import BucketingSampler +from .dataset import AudioDataProcessor, AudioDataset +from .samplers import BucketingSampler LOGGER = logging.getLogger(__name__) diff --git a/sonosco/datasets/data_sampler.py b/sonosco/datasets/samplers.py similarity index 97% rename from sonosco/datasets/data_sampler.py rename to sonosco/datasets/samplers.py index b3bfc14..416b754 100644 --- a/sonosco/datasets/data_sampler.py +++ b/sonosco/datasets/samplers.py @@ -66,4 +66,4 @@ def shuffle(self, epoch): g = torch.Generator() g.manual_seed(epoch) bin_ids = list(torch.randperm(len(self.bins), generator=g)) - self.bins = [self.bins[i] for i in bin_ids] \ No newline at end of file + self.bins = [self.bins[i] for i in bin_ids] diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 69cb1cb..2acd1a9 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -6,9 +6,9 @@ from sonosco.common.constants import SONOSCO from sonosco.common.utils import setup_logging -from sonosco.datasets.audio_dataset import AudioDataset, AudioDataProcessor -from sonosco.datasets.data_sampler import BucketingSampler -from sonosco.datasets.data_loader import DataLoader +from sonosco.datasets.dataset import AudioDataset, AudioDataProcessor +from sonosco.datasets.samplers import BucketingSampler +from sonosco.datasets.loader import DataLoader from 
sonosco.datasets.download_datasets.librispeech import try_download_librispeech From b46c558157e348172cdb0b26a213311f9e2460e0 Mon Sep 17 00:00:00 2001 From: yuriyarabskyy Date: Tue, 25 Jun 2019 13:56:55 +0200 Subject: [PATCH 55/58] restructure classes into separate modules --- sonosco/datasets/processor.py | 88 +++++++++++++++ sonosco/training/early_stopping.py | 45 ++++++++ sonosco/training/gradient_collector.py | 50 +++++++++ sonosco/training/helpers.py | 143 ------------------------- sonosco/training/history_recorder.py | 26 +++++ sonosco/training/model_checkpoint.py | 43 ++++++++ 6 files changed, 252 insertions(+), 143 deletions(-) create mode 100644 sonosco/datasets/processor.py create mode 100644 sonosco/training/early_stopping.py create mode 100644 sonosco/training/gradient_collector.py delete mode 100644 sonosco/training/helpers.py create mode 100644 sonosco/training/history_recorder.py create mode 100644 sonosco/training/model_checkpoint.py diff --git a/sonosco/datasets/processor.py b/sonosco/datasets/processor.py new file mode 100644 index 0000000..2bbf7a8 --- /dev/null +++ b/sonosco/datasets/processor.py @@ -0,0 +1,88 @@ +import logging +import torch +import librosa +import numpy as np +import sonosco.common.audio_tools as audio_tools +import sonosco.common.utils as utils + + +LOGGER = logging.getLogger(__name__) +MIN_STRETCH = 0.7 +MAX_STRETCH = 1.3 +MIN_PITCH = 0.7 +MAX_PITCH = 1.5 +MAX_SHIFT = 4000 + + +class AudioDataProcessor: + + def __init__(self, window_stride, window_size, sample_rate, labels="abc", normalize=False, augment=False, **kwargs): + """ + Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by + a comma. Each new line is a different sample. Example below: + /path/to/audio.wav,/path/to/audio.txt + ... 
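+        (e.g. with sample_rate=16000 and window_size=0.02, one spectrogram
+        window covers int(16000 * 0.02) = 320 samples)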
+        :param window_stride: number of seconds to skip between each window
+        :param window_size: number of seconds to use for a window of spectrogram
+        :param sample_rate: sample rate of the recordings
+        :param labels: string containing all the possible characters to map to
+        :param normalize: apply standard mean and deviation normalization to audio tensor
+        :param augment: apply random tempo and gain perturbations (default: False)
+        """
+        self.window_stride = window_stride
+        self.window_size = window_size
+        self.sample_rate = sample_rate
+        self.labels_map = dict([(labels[i], i) for i in range(len(labels))])
+        self.normalize = normalize
+        self.augment = augment
+
+    @property
+    def window_stride_samples(self):
+        return int(self.sample_rate * self.window_stride)
+
+    @property
+    def window_size_samples(self):
+        return int(self.sample_rate * self.window_size)
+
+    def retrieve_file(self, audio_path):
+        sound, sample_rate = librosa.load(audio_path, sr=self.sample_rate)
+        return sound, sample_rate
+
+    def augment_audio(self, sound, stretch=True, shift=False, pitch=True, noise=True):
+        augmented = audio_tools.stretch(sound, utils.random_float(MIN_STRETCH, MAX_STRETCH)) if stretch else sound
+        augmented = audio_tools.shift(augmented, np.random.randint(MAX_SHIFT)) if shift else augmented
+        augmented = audio_tools.pitch_shift(augmented, self.sample_rate,
+                                            n_steps=utils.random_float(MIN_PITCH, MAX_PITCH)) if pitch else augmented
+        augmented = audio_tools.add_noise(augmented) if noise else augmented
+        return augmented
+
+    def parse_audio(self, audio_path, raw=False):
+        sound, sample_rate = self.retrieve_file(audio_path)
+
+        if sample_rate != self.sample_rate:
+            raise ValueError(f"The stated sample rate {self.sample_rate} and the actual rate {sample_rate} differ!")
+
+        if self.augment:
+            sound = self.augment_audio(sound)
+
+        if raw:
+            return sound
+
+        # TODO: comment why take the last element?
+        complex_spectrogram = librosa.stft(sound,
+                                           n_fft=self.window_size_samples,
+                                           hop_length=self.window_stride_samples,
+                                           win_length=self.window_size_samples)
+        spectrogram, phase = librosa.magphase(complex_spectrogram)
+        # S = log(S+1)
+        spectrogram = torch.from_numpy(np.log1p(spectrogram))
+
+        return spectrogram
+
+    def parse_transcript(self, transcript_path):
+        with open(transcript_path, 'r', encoding='utf8') as transcript_file:
+            transcript = transcript_file.read().replace('\n', '')
+        # TODO: Is it fast enough?
+        transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)]))
+        LOGGER.debug(f"transcript_path: {transcript_path} transcript: {transcript}")
+        return transcript
diff --git a/sonosco/training/early_stopping.py b/sonosco/training/early_stopping.py
new file mode 100644
index 0000000..8eefa6b
--- /dev/null
+++ b/sonosco/training/early_stopping.py
@@ -0,0 +1,45 @@
+import logging
+import sys
+
+from .abstract_callback import AbstractCallback
+
+
+LOGGER = logging.getLogger(__name__)
+
+
+class EarlyStopping(AbstractCallback):
+    """
+    Early Stopping to terminate training early if the monitored metric did not improve
+    over a number of epochs.
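+
+    Example (illustrative):
+        early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=5)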

diff --git a/sonosco/training/early_stopping.py b/sonosco/training/early_stopping.py
new file mode 100644
index 0000000..8eefa6b
--- /dev/null
+++ b/sonosco/training/early_stopping.py
@@ -0,0 +1,45 @@
+import logging
+import sys
+
+from .abstract_callback import AbstractCallback
+
+
+LOGGER = logging.getLogger(__name__)
+
+
+class EarlyStopping(AbstractCallback):
+    """
+    Early Stopping to terminate training early if the monitored metric did not improve
+    over a number of epochs.
+
+    Args:
+        monitor (string): name of the relevant loss or metric (usually 'val_loss')
+        min_delta (float): minimum change in monitored metric to qualify as an improvement
+        patience (int): number of epochs to wait for an improvement before terminating the training
+    """
+
+    def __init__(self, monitor='val_loss', min_delta=0, patience=5):
+        self.monitor = monitor
+        self.min_delta = min_delta
+        self.patience = patience
+        self.last_best = sys.float_info.max
+        self.counter = 0
+        self.stopped_epoch = 0
+
+    def __call__(self, epoch, step, performance_measures, context):
+
+        if step != len(context.train_data_loader) - 1:  # only continue at end of epoch
+            return
+
+        if self.monitor not in performance_measures:
+            return
+
+        current_loss = performance_measures[self.monitor]
+        if (self.last_best - current_loss) >= self.min_delta:
+            self.last_best = current_loss
+            self.counter = 0
+        else:
+            self.counter += 1
+
+        if self.counter >= self.patience:
+            context._stop_training = True  # make ModelTrainer stop
+            LOGGER.info(f"Early stopping after epoch {epoch}")

diff --git a/sonosco/training/gradient_collector.py b/sonosco/training/gradient_collector.py
new file mode 100644
index 0000000..386b874
--- /dev/null
+++ b/sonosco/training/gradient_collector.py
@@ -0,0 +1,50 @@
+import logging
+import numpy as np
+import torch
+
+from .abstract_callback import AbstractCallback
+
+
+LOGGER = logging.getLogger(__name__)
+
+
+class LayerwiseGradientNorm(AbstractCallback):
+    """ Collects the layer-wise gradient norms for each epoch. """
+
+    def __init__(self):
+        self.layer_grads = dict()
+        self._batch_layer_grads = dict()
+
+    def __call__(self, epoch, step, performance_measures, context):
+        """
+        Store gradient norms for each batch and compute means after the
+        epoch's last batch.
+        """
+        self._store_batch_layer_grads(context.model)
+
+        if step == (len(context.train_data_loader) - 1):  # end of epoch
+            self._store_layer_grads()
+            self._batch_layer_grads = dict()
+
+    def _store_batch_layer_grads(self, model):
+        """ Store gradient norm of each layer for current batch. """
+        for name, param in model.named_parameters():
+
+            if not param.requires_grad or param.grad is None:
+                continue
+
+            if name not in self._batch_layer_grads:
+                self._batch_layer_grads[name] = []
+
+            grad_norm = torch.sqrt(torch.sum(param.grad**2)).item()
+            self._batch_layer_grads[name].append(grad_norm)
+
+    def _store_layer_grads(self):
+        """ Compute mean of all batch steps in epoch. """
+        for name, grads in self._batch_layer_grads.items():
+
+            if name not in self.layer_grads:
+                self.layer_grads[name] = []
+
+            layer_epoch_grad = np.mean(grads)
+            self.layer_grads[name].append(layer_epoch_grad)

diff --git a/sonosco/training/helpers.py b/sonosco/training/helpers.py
deleted file mode 100644
index 893c14d..0000000
--- a/sonosco/training/helpers.py
+++ /dev/null
@@ -1,143 +0,0 @@
-import logging
-import sys
-import os.path as path
-import numpy as np
-import torch
-
-from collections import defaultdict
-from .abstract_callback import AbstractCallback
-
-
-LOGGER = logging.getLogger(__name__)
-
-
-class HistoryRecorder(AbstractCallback):
-    """ Records all losses and metrics during training.
""" - - def __init__(self, epoch_steps): - self.history = defaultdict(list) - self._epoch_steps = epoch_steps - - def __call__(self, epoch, step, performance_measures, context): - - if step % self._epoch_steps == 0: # only record at end of epoch - return - - for key, value in performance_measures.items(): - if type(value) == torch.Tensor: - value = value.item() - self.history[key].append(value) - - -class ModelCheckpoint(AbstractCallback): - """ - Saves the model and optimizer state at the point with lowest validation error throughout training. - Args: - output_path (string): path to directory where the checkpoint will be saved to - model_name (string): name of the checkpoint file - """ - - def __init__(self, output_path, model_name='model_checkpoint.pt'): - self.output_path = path.join(output_path, model_name) - self.best_val_score = sys.float_info.max - - def __call__(self, epoch, step, performance_measures, context): - - if 'val_loss' not in performance_measures: - return - - if performance_measures['val_loss'] < self.best_val_score: - self.best_val_score = performance_measures['val_loss'] - self._save_checkpoint(context.model, context.optimizer, epoch) - - def _save_checkpoint(self, model, optimizer, epoch): - LOGGER.info("Saving model at checkpoint.") - model.eval() - model_state_dict = model.state_dict() - optimizer_state_dict = optimizer.state_dict() - torch.save({'arch': model.__class__.__name__, - 'epoch': epoch, - 'model_state_dict': model_state_dict, - 'optimizer_state_dict': optimizer_state_dict - }, self.output_path) - model.train() - - -class LayerwiseGradientNorm(AbstractCallback): - """ Collects the layer-wise gradient norms for each epoch. """ - - def __init__(self): - self.layer_grads = dict() - self._batch_layer_grads = dict() - - def __call__(self, epoch, step, performance_measures, context): - """ - Store gradient norms for each batch and compute means after the - epoch's last batch. - """ - self._store_batch_layer_grads(context.model) - - if step == (len(context.train_data_loader) - 1): # end of epoch - self._store_layer_grads() - self._batch_layer_grads = dict() - - def _store_batch_layer_grads(self, model): - """ Store gradient norm of each layer for current batch. """ - for name, param in model.named_parameters(): - - if not param.requires_grad or param.grad is None: - continue - - if not name in self._batch_layer_grads: - self._batch_layer_grads[name] = [] - - grad_norm = torch.sqrt(torch.sum(param.grad**2)).item() - self._batch_layer_grads[name].append(grad_norm) - - def _store_layer_grads(self): - """ Compute mean of all batch steps in epoch. """ - for name, grads in self._batch_layer_grads.items(): - - if name not in self.layer_grads: - self.layer_grads[name] = [] - - layer_epoch_grad = np.mean(grads) - self.layer_grads[name].append(layer_epoch_grad) - - -class EarlyStopping(AbstractCallback): - """ - Early Stopping to terminate training early if the monitored metric did not improve - over a number of epochs. 
-
-    Args:
-        monitor (string): name of the relevant loss or metric (usually 'val_loss')
-        min_delta (float): minimum change in monitored metric to qualify as an improvement
-        patience (int): number of epochs to wait for an improvement before terminating the training
-    """
-
-    def __init__(self, monitor='val_loss', min_delta=0, patience=5):
-        self.monitor = monitor
-        self.min_delta = min_delta
-        self.patience = patience
-        self.last_best = sys.float_info.max
-        self.counter = 0
-        self.stopped_epoch = 0
-
-    def __call__(self, epoch, step, performance_measures, context):
-
-        if step != len(context.train_data_loader) - 1:  # only continue at end of epoch
-            return
-
-        if self.monitor not in performance_measures:
-            return
-
-        current_loss = performance_measures[self.monitor]
-        if (self.last_best - current_loss) >= self.min_delta:
-            self.last_best = current_loss
-            self.counter = 0
-        else:
-            self.counter += 1
-
-        if self.counter >= self.patience:
-            context._stop_training = True  # make ModelTrainer stop
-            LOGGER.info(f"Early stopping after epoch {epoch}")

diff --git a/sonosco/training/history_recorder.py b/sonosco/training/history_recorder.py
new file mode 100644
index 0000000..5737a7b
--- /dev/null
+++ b/sonosco/training/history_recorder.py
@@ -0,0 +1,26 @@
+import logging
+import torch
+
+from collections import defaultdict
+from .abstract_callback import AbstractCallback
+
+
+LOGGER = logging.getLogger(__name__)
+
+
+class HistoryRecorder(AbstractCallback):
+    """ Records all losses and metrics during training. """
+
+    def __init__(self, epoch_steps):
+        self.history = defaultdict(list)
+        self._epoch_steps = epoch_steps
+
+    def __call__(self, epoch, step, performance_measures, context):
+
+        if step % self._epoch_steps == 0:  # skip the step at each epoch boundary
+            return
+
+        for key, value in performance_measures.items():
+            if type(value) == torch.Tensor:
+                value = value.item()
+            self.history[key].append(value)
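All of these training callbacks share one call protocol, so wiring them up is uniform. A minimal sketch of that contract; the context stub below stands in for the trainer context the callbacks expect (it must expose model, optimizer, train_data_loader and _stop_training), and all numbers are made up:

    from sonosco.training.early_stopping import EarlyStopping
    from sonosco.training.history_recorder import HistoryRecorder

    class Ctx:  # stand-in for the trainer context, assumed from the attribute accesses above
        train_data_loader = [None] * 100  # len() gives the number of steps per epoch
        _stop_training = False

    context = Ctx()
    recorder = HistoryRecorder(epoch_steps=100)
    stopper = EarlyStopping(monitor='val_loss', patience=3)

    # the trainer is expected to invoke each callback once per training step
    for callback in (recorder, stopper):
        callback(epoch=0, step=99, performance_measures={'val_loss': 0.42}, context=context)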
diff --git a/sonosco/training/model_checkpoint.py b/sonosco/training/model_checkpoint.py
new file mode 100644
index 0000000..78f8f4a
--- /dev/null
+++ b/sonosco/training/model_checkpoint.py
@@ -0,0 +1,43 @@
+import logging
+import sys
+import os.path as path
+import torch
+
+from .abstract_callback import AbstractCallback
+
+
+LOGGER = logging.getLogger(__name__)
+
+
+class ModelCheckpoint(AbstractCallback):
+    """
+    Saves the model and optimizer state at the point with lowest validation error throughout training.
+
+    Args:
+        output_path (string): path to directory where the checkpoint will be saved to
+        model_name (string): name of the checkpoint file
+    """
+
+    def __init__(self, output_path, model_name='model_checkpoint.pt'):
+        self.output_path = path.join(output_path, model_name)
+        self.best_val_score = sys.float_info.max
+
+    def __call__(self, epoch, step, performance_measures, context):
+
+        if 'val_loss' not in performance_measures:
+            return
+
+        if performance_measures['val_loss'] < self.best_val_score:
+            self.best_val_score = performance_measures['val_loss']
+            self._save_checkpoint(context.model, context.optimizer, epoch)
+
+    def _save_checkpoint(self, model, optimizer, epoch):
+        LOGGER.info("Saving model at checkpoint.")
+        model.eval()
+        model_state_dict = model.state_dict()
+        optimizer_state_dict = optimizer.state_dict()
+        torch.save({'arch': model.__class__.__name__,
+                    'epoch': epoch,
+                    'model_state_dict': model_state_dict,
+                    'optimizer_state_dict': optimizer_state_dict
+                    }, self.output_path)
+        model.train()

From 2ffa5120ef8b29e5936821f1e49d9fcf1633798f Mon Sep 17 00:00:00 2001
From: yuriyarabskyy
Date: Tue, 25 Jun 2019 14:06:58 +0200
Subject: [PATCH 56/58] make add_noise a class member

---
 sonosco/common/audio_tools.py  | 6 ------
 sonosco/common/noise_makers.py | 8 +++++++-
 sonosco/datasets/processor.py  | 7 ++++++-
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/sonosco/common/audio_tools.py b/sonosco/common/audio_tools.py
index 807e79b..fa53021 100644
--- a/sonosco/common/audio_tools.py
+++ b/sonosco/common/audio_tools.py
@@ -21,12 +21,6 @@ def transcode_recordings_ted3(source, destination, start_time, end_time, sample_
     subprocess.call([f"sox {source} -r {sample_rate} -b 16 -c 1 {destination} trim {start_time} ={end_time}"],shell=True)
 
 
-def add_noise(audio, noise_maker: NoiseMaker = None):
-    if noise_maker is None:
-        noise_maker = GaussianNoiseMaker()
-    return noise_maker(audio)
-
-
 def shift(audio, n_samples=1600):
     return np.roll(audio, n_samples)
 
diff --git a/sonosco/common/noise_makers.py b/sonosco/common/noise_makers.py
index 551e2ec..a280e94 100644
--- a/sonosco/common/noise_makers.py
+++ b/sonosco/common/noise_makers.py
@@ -1,12 +1,18 @@
 import numpy as np
+from abc import ABC, abstractmethod
 
 
-class NoiseMaker:
+class NoiseMaker(ABC):
 
+    @abstractmethod
     def __call__(self, audio):
         """Adds noise to the audio signal."""
         pass
 
+    def add_noise(self, audio):
+        return self(audio)
+
 
 class GaussianNoiseMaker(NoiseMaker):
 
diff --git a/sonosco/datasets/processor.py b/sonosco/datasets/processor.py
index 2bbf7a8..d8e91e2 100644
--- a/sonosco/datasets/processor.py
+++ b/sonosco/datasets/processor.py
@@ -4,6 +4,7 @@
 import numpy as np
 import sonosco.common.audio_tools as audio_tools
 import sonosco.common.utils as utils
+import sonosco.common.noise_makers as noise_makers
 
 
 LOGGER = logging.getLogger(__name__)
@@ -53,7 +54,11 @@ def augment_audio(self, sound, stretch=True, shift=False, pitch=True, noise=True
         augmented = audio_tools.shift(augmented, np.random.randint(MAX_SHIFT)) if shift else augmented
         augmented = audio_tools.pitch_shift(augmented, self.sample_rate,
                                             n_steps=utils.random_float(MIN_PITCH, MAX_PITCH)) if pitch else augmented
-        augmented = audio_tools.add_noise(augmented) if noise else augmented
+
+        if noise:
+            noise_maker = noise_makers.GaussianNoiseMaker()
+            augmented = noise_maker.add_noise(augmented)
+
         return augmented
 
     def parse_audio(self, audio_path, raw=False):
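The upshot of this patch: noise injection now goes through the NoiseMaker hierarchy instead of a free function. A short sketch of the resulting call path, assuming GaussianNoiseMaker's default constructor and a synthetic signal:

    import numpy as np
    from sonosco.common.noise_makers import GaussianNoiseMaker

    audio = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz
    maker = GaussianNoiseMaker()               # concrete NoiseMaker implementing __call__
    noisy = maker.add_noise(audio)             # base-class helper delegating to __call__(audio)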
From 70b6bfb123e1582c7bf57a5cfcf2033bdfe9ff2c Mon Sep 17 00:00:00 2001
From: yuriyarabskyy
Date: Tue, 25 Jun 2019 14:18:26 +0200
Subject: [PATCH 57/58] restructure model loader

---
 sonosco/datasets/loader.py             |  28 +--
 sonosco/loader.py                      | 250 ------------------------
 sonosco/models/deepspeech2.py          |  99 +---------
 sonosco/models/deepspeech2_sonosco.py  |  11 +-
 sonosco/{model.py => models/loader.py} |  53 +----
 sonosco/models/modules.py              | 101 ++++++++++
 sonosco/models/saver.py                |  56 ++++++
 sonosco/{ => models}/serialization.py  |   0
 8 files changed, 178 insertions(+), 420 deletions(-)
 delete mode 100644 sonosco/loader.py
 rename sonosco/{model.py => models/loader.py} (66%)
 create mode 100644 sonosco/models/modules.py
 create mode 100644 sonosco/models/saver.py
 rename sonosco/{ => models}/serialization.py (100%)

diff --git a/sonosco/datasets/loader.py b/sonosco/datasets/loader.py
index 92c3951..e23c2fd 100644
--- a/sonosco/datasets/loader.py
+++ b/sonosco/datasets/loader.py
@@ -1,8 +1,8 @@
-import numpy as np
 import logging
 import torch
+import torch.nn
 
-from torch.utils.data import Dataset, DataLoader, Sampler
+from torch.utils.data import DataLoader
 
 from .dataset import AudioDataProcessor, AudioDataset
 from .samplers import BucketingSampler
@@ -20,23 +20,23 @@ def __init__(self, *args, **kwargs):
         self.collate_fn = self._collate_fn
 
     def _collate_fn(self, batch):
-        #sort the batch in decreasing order of sequence length
+        # sort the batch in decreasing order of sequence length
         batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True)
 
-        #pad the tensors to have equal lengths, therefore transpose the tensors in
-        #the batch. The tensors have shape: freq_size x sequence_length
-        #and need to be of shape: sequence_length x freq_length, as sequence length differs
-        #but not the freq_length
+        # pad the tensors to have equal lengths, therefore transpose the tensors in
+        # the batch. The tensors have shape: freq_size x sequence_length
+        # and need to be of shape: sequence_length x freq_size, as the sequence length differs
+        # but not the freq_size
         inputs = torch.nn.utils.rnn.pad_sequence(list(map(lambda x: x[0].transpose(0,1), batch)), batch_first=True)
 
-        #inputs need to be transposed back from shape batch_size x sequence_length x freq_length
-        #to batch_size x freq_length x sequence_length. Additionally, unsqueeze tensor
-        inputs = inputs.transpose(1,2).unsqueeze(1)
-        input_lengths = torch.IntTensor(list(map(lambda x: x[0].size(1), batch))) #create tensor of input lengths
+        # inputs need to be transposed back from shape batch_size x sequence_length x freq_size
+        # to batch_size x freq_size x sequence_length.
Additionally, unsqueeze tensor + inputs = inputs.transpose(1, 2).unsqueeze(1) + input_lengths = torch.IntTensor(list(map(lambda x: x[0].size(1), batch))) # create tensor of input lengths - targets_arr = list(zip(*batch))[1] #extract targets array from batch ( batch is array of tuples) - target_lengths = torch.IntTensor(list(map(lambda x: len(x),targets_arr))) #create tensor of target lengths - targets = torch.cat(list(map(lambda x: torch.IntTensor(x), targets_arr))) #create tensor of targets + targets_arr = list(zip(*batch))[1] # extract targets array from batch ( batch is array of tuples) + target_lengths = torch.IntTensor(list(map(lambda x: len(x), targets_arr))) # create tensor of target lengths + targets = torch.cat(list(map(lambda x: torch.IntTensor(x), targets_arr))) # create tensor of targets return inputs, targets, input_lengths, target_lengths diff --git a/sonosco/loader.py b/sonosco/loader.py deleted file mode 100644 index b73b065..0000000 --- a/sonosco/loader.py +++ /dev/null @@ -1,250 +0,0 @@ -# ---------------------------------------------------------------------------- -# Based on SeanNaren's deepspeech.pytorch: -# https://github.com/SeanNaren/deepspeech.pytorch -# ---------------------------------------------------------------------------- - -import math -import warnings -from typing import Tuple - -import librosa -import numpy as np -import torch -import torchaudio -from scipy import signal -from torch.utils.data import Dataset, DataLoader, Sampler - -# FIXME: Deprecated functions usage -from torch.distributed.deprecated import get_rank -from torch.distributed.deprecated import get_world_size - -windows = {"bartlett": torch.bartlett_window, - "blackman": torch.blackman_window, - "hamming": torch.hamming_window, - "hann": torch.hann_window} - -windows_legacy = {'hamming': signal.hamming, - 'hann': signal.hann, - 'blackman': signal.blackman, - 'bartlett': signal.bartlett} - - -class DataProcessor(object): - def __init__(self, audio_conf, labels="abc", normalize=False, augment=False, legacy=True): - """ - Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by - a comma. Each new line is a different sample. Example below: - - /path/to/audio.wav,/path/to/audio.txt - ... 
- - :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds - :param labels: String containing all the possible characters to map to - :param normalize: Apply standard mean and deviation normalization to audio tensor - :param augment(default False): Apply random tempo and gain perturbations - """ - self.labels_map = dict([(labels[i], i) for i in range(len(labels))]) - self.window_stride = audio_conf["window_stride"] - self.window_size = audio_conf["window_size"] - self.sample_rate = audio_conf["sample_rate"] - self.window = windows_legacy.get(audio_conf["window"], windows_legacy["hamming"]) if legacy else windows.get(audio_conf["window"], windows["hamming"]) - self.normalize = normalize - self.augment = augment - self.legacy = legacy - self.transform = torchaudio.transforms.Spectrogram(n_fft=int(self.sample_rate * self.window_size), - hop=int(self.sample_rate * self.window_stride), - window=self.window, normalize=self.normalize) - - @staticmethod - def retrieve_file(audio_path, legacy=True): - sound, sample_rate = torchaudio.load(audio_path) - if legacy: - sound = sound.numpy().T - if len(sound.shape) > 1: - if sound.shape[1] == 1: - sound = sound.squeeze() - else: - sound = sound.mean(axis=1) - return sound, sample_rate - - @staticmethod - def augment_audio(sound, tempo_range: Tuple = (0.85, 1.15), gain_range: Tuple = (-6, 8)): - """ - Changes tempo and gain of the wave - """ - warnings.warn("Augmentation is not implemented") # TODO: Implement - return sound - - def parse_audio(self, audio_path): - sound, sample_rate = self.retrieve_file(audio_path, self.legacy) - if sample_rate != self.sample_rate: - raise ValueError(f"The stated sample rate {self.sample_rate} and the factual rate {sample_rate} differ!") - - if self.augment: - sound = self.augment_audio(sound) - - if self.legacy: - n_fft = int(self.sample_rate * self.window_size) - win_length = n_fft - hop_length = int(self.sample_rate * self.window_stride) - # STFT - D = librosa.stft(sound, n_fft=n_fft, hop_length=hop_length, - win_length=win_length, window=self.window) - spectrogram, phase = librosa.magphase(D) - # S = log(S+1) - - spectrogram = torch.FloatTensor(np.log1p(spectrogram)) - else: - # TODO: Why these are different from librosa.stft? - sound = sound.cuda() - spectrogram = self.transform(sound)[-1, :, :].transpose(0, 1) - - # spectrogram = torch.stft(torch.from_numpy(sound.numpy().T.squeeze()), - # n_fft=int(self.sample_rate * self.window_size), - # hop_length=int(self.sample_rate * self.window_stride), - # win_length=int(self.sample_rate * self.window_size), - # window=torch.hamming_window(int(self.sample_rate * self.window_size)))[:, :, -1] - - if self.normalize: - mean = spectrogram.mean() - std = spectrogram.std() - spectrogram.add_(-mean) - spectrogram.div_(std) - - return spectrogram - - def parse_transcript(self, transcript_path): - with open(transcript_path, 'r', encoding='utf8') as transcript_file: - transcript = transcript_file.read().replace('\n', '') - # TODO: Is it fast enough? - transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)])) - return transcript - - -class AudioDataset(Dataset): - def __init__(self, audio_conf, manifest_filepath, labels, normalize=False, augment=False, legacy=True): - """ - Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by - a comma. Each new line is a different sample. Example below: - - /path/to/audio.wav,/path/to/audio.txt - ... 
- - :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds - :param manifest_filepath: Path to manifest csv as describe above - :param labels: String containing all the possible characters to map to - :param normalize: Apply standard mean and deviation normalization to audio tensor - :param augment(default False): Apply random tempo and gain perturbations - """ - super(AudioDataset, self).__init__() - with open(manifest_filepath) as f: - ids = f.readlines() - ids = [x.strip().split(',') for x in ids] - self.ids = ids - self.size = len(ids) - self.processor = DataProcessor(audio_conf, labels, normalize, augment, legacy) - - def __getitem__(self, index): - sample = self.ids[index] - audio_path, transcript_path = sample[0], sample[1] - - spectrogram = self.processor.parse_audio(audio_path) - transcript = self.processor.parse_transcript(transcript_path) - - return spectrogram, transcript - - def __len__(self): - return self.size - - -# TODO: Optimise -def _collate_fn(batch): - batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True) - longest_sample = batch[0][0] - freq_size, max_seqlength = longest_sample.size() - minibatch_size = len(batch) - inputs = torch.zeros(minibatch_size, 1, freq_size, max_seqlength) - input_percentages = torch.FloatTensor(minibatch_size) - target_sizes = np.zeros(minibatch_size, dtype=np.int32) - - # TODO: Numpy broadcasting magic - targets = [] - - for x in range(minibatch_size): - inputs[x][0].narrow(1, 0, batch[x][0].size(1)).copy_(batch[x][0]) - input_percentages[x] = batch[x][0].size(1) / float(max_seqlength) - target_sizes[x] = len(batch[x][1]) - targets.extend(batch[x][1]) - - return inputs, torch.IntTensor(targets), input_percentages, torch.from_numpy(target_sizes) - - -class AudioDataLoader(DataLoader): - def __init__(self, *args, **kwargs): - """ - Creates a data loader for AudioDatasets. - """ - super(AudioDataLoader, self).__init__(*args, **kwargs) - self.collate_fn = _collate_fn - - -class BucketingSampler(Sampler): - def __init__(self, data_source, batch_size=1): - """ - Samples batches assuming they are in order of size to batch similarly sized samples together. - """ - super(BucketingSampler, self).__init__(data_source) - self.data_source = data_source - ids = list(range(0, len(data_source))) - # TODO: Optimise - self.bins = [ids[i:i + batch_size] for i in range(0, len(ids), batch_size)] - - def __iter__(self): - for ids in self.bins: - np.random.shuffle(ids) - yield ids - - def __len__(self): - return len(self.bins) - - def shuffle(self, epoch): - np.random.shuffle(self.bins) - - -# TODO: Optimise -class DistributedBucketingSampler(Sampler): - def __init__(self, data_source, batch_size=1, num_replicas=None, rank=None): - """ - Samples batches assuming they are in order of size to batch similarly sized samples together. 
- """ - super(DistributedBucketingSampler, self).__init__(data_source) - if num_replicas is None: - num_replicas = get_world_size() - if rank is None: - rank = get_rank() - self.data_source = data_source - self.ids = list(range(0, len(data_source))) - self.batch_size = batch_size - self.bins = [self.ids[i:i + batch_size] for i in range(0, len(self.ids), batch_size)] - self.num_replicas = num_replicas - self.rank = rank - self.num_samples = int(math.ceil(len(self.bins) * 1.0 / self.num_replicas)) - self.total_size = self.num_samples * self.num_replicas - - def __iter__(self): - offset = self.rank - # add extra samples to make it evenly divisible - bins = self.bins + self.bins[:(self.total_size - len(self.bins))] - assert len(bins) == self.total_size - samples = bins[offset::self.num_replicas] # Get every Nth bin, starting from rank - return iter(samples) - - def __len__(self): - return self.num_samples - - def shuffle(self, epoch): - # deterministically shuffle based on epoch - g = torch.Generator() - g.manual_seed(epoch) - bin_ids = list(torch.randperm(len(self.bins), generator=g)) - self.bins = [self.bins[i] for i in bin_ids] diff --git a/sonosco/models/deepspeech2.py b/sonosco/models/deepspeech2.py index 35645d2..0fff74f 100644 --- a/sonosco/models/deepspeech2.py +++ b/sonosco/models/deepspeech2.py @@ -2,111 +2,16 @@ # Based on SeanNaren's deepspeech.pytorch: # https://github.com/SeanNaren/deepspeech.pytorch # ---------------------------------------------------------------------------- - import math -from collections import OrderedDict - import torch import logging import torch.nn as nn -import torch.nn.functional as F +from collections import OrderedDict +from .modules import MaskConv, BatchRNN, SequenceWise, InferenceBatchSoftmax, supported_rnns, supported_rnns_inv LOGGER = logging.getLogger(__name__) -supported_rnns = { - 'lstm': nn.LSTM, - 'rnn': nn.RNN, - 'gru': nn.GRU -} -supported_rnns_inv = dict((v, k) for k, v in supported_rnns.items()) - - -class SequenceWise(nn.Module): - def __init__(self, module): - """ - Collapses input of dim T*N*H to (T*N)*H, and applies to a module. - Allows handling of variable sequence lengths and minibatch sizes. - :param module: Module to apply input to. - """ - super(SequenceWise, self).__init__() - self.module = module - - def forward(self, x): - t, n = x.size(0), x.size(1) - x = x.view(t * n, -1) - x = self.module(x) - x = x.view(t, n, -1) - return x - - def __repr__(self): - tmpstr = self.__class__.__name__ + ' (\n' - tmpstr += self.module.__repr__() - tmpstr += ')' - return tmpstr - - -class MaskConv(nn.Module): - def __init__(self, seq_module): - """ - Adds padding to the output of the module based on the given lengths. This is to ensure that the - results of the model do not change when batch sizes change during inference. - Input needs to be in the shape of (BxCxDxT) - :param seq_module: The sequential module containing the conv stack. 
- """ - super(MaskConv, self).__init__() - self.seq_module = seq_module - - def forward(self, x, lengths): - """ - :param x: The input of size BxCxDxT - :param lengths: The actual length of each sequence in the batch - :return: Masked output from the module - """ - for module in self.seq_module: - x = module(x) - mask = torch.ByteTensor(x.size()).fill_(0) - if x.is_cuda: - mask = mask.cuda() - for i, length in enumerate(lengths): - length = length.item() - if (mask[i].size(2) - length) > 0: - mask[i].narrow(2, length, mask[i].size(2) - length).fill_(1) - x = x.masked_fill(mask, 0) - return x, lengths - - -class InferenceBatchSoftmax(nn.Module): - def forward(self, input_): - if not self.training: - return F.softmax(input_, dim=-1) - else: - return input_ - - -class BatchRNN(nn.Module): - def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, batch_norm=True, bidirectional=False): - super(BatchRNN, self).__init__() - self.bidirectional = bidirectional - self.input_size = input_size - self.hidden_size = hidden_size - self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None - self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size, - bidirectional=bidirectional, bias=True) - - def flatten_parameters(self): - self.rnn.flatten_parameters() - - def forward(self, x, output_lengths): - if self.batch_norm is not None: - x = self.batch_norm(x) - x = nn.utils.rnn.pack_padded_sequence(x, output_lengths) - x, h = self.rnn(x) - x, _ = nn.utils.rnn.pad_packed_sequence(x) - if self.bidirectional: - x = x.view(x.size(0), x.size(1), 2, -1).sum(2).view(x.size(0), x.size(1), -1) # (TxNxH*2) -> (TxNxH) by sum - return x - class DeepSpeech2(nn.Module): def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hid_size=768, nb_layers=5, audio_conf=None, diff --git a/sonosco/models/deepspeech2_sonosco.py b/sonosco/models/deepspeech2_sonosco.py index 1e1b1c3..c95e7eb 100644 --- a/sonosco/models/deepspeech2_sonosco.py +++ b/sonosco/models/deepspeech2_sonosco.py @@ -1,13 +1,10 @@ import math -from collections import OrderedDict -from dataclasses import field -import torch from torch import nn - -from models.deepspeech2 import MaskConv, BatchRNN, SequenceWise, InferenceBatchSoftmax, supported_rnns, \ - supported_rnns_inv -from serialization import serializable +from .modules import MaskConv, BatchRNN, SequenceWise, InferenceBatchSoftmax, supported_rnns, supported_rnns_inv +from .serialization import serializable +from collections import OrderedDict +from dataclasses import field @serializable diff --git a/sonosco/model.py b/sonosco/models/loader.py similarity index 66% rename from sonosco/model.py rename to sonosco/models/loader.py index c943244..66e0b11 100644 --- a/sonosco/model.py +++ b/sonosco/models/loader.py @@ -1,64 +1,13 @@ import logging - import torch import deprecation -import inspect import torch.nn as nn -from common.class_utils import get_constructor_args, get_class_by_name -from serialization import is_serializable +from sonosco.common.class_utils import get_constructor_args, get_class_by_name LOGGER = logging.getLogger(__name__) -class Saver: - - def __init__(self) -> None: - super().__init__() - - @deprecation.deprecated( - details="This type of saving may cause problems when path of model class changes. Pleas use save_model instead") - def save_model_simple(self, model: nn.Module, path: str) -> None: - """ - Simply saves the model using pickle protocol. 
- Args: - model (nn.Module): model to save - path (str) : path where to save the model - - Returns: - - """ - torch.save(model, path) - - def save_model(self, model: nn.Module, path: str) -> None: - """ - Saves the model using pickle protocol. - - If the infer_structure is True this method infers all the meta parameters of the model and save them together - with learnable parameters. - - If the infer_structure is False and method specified by serialize_method_name exists, the return value of the - serialize_method_name method is saved. - - If neither of above only learnable parameters a.k.a. state_dict are saved. - - Args: - model (nn.Module): model to save - path (str) : path where to save the model - infer_structure (bool): indicator whether to infer the model structure - serialize_method_name (str): name of the function that this method should call in order to serialize the - model. Must return dict. - - Returns: - - """ - if is_serializable(model): - entity_to_save = model.__serialize__() - torch.save(entity_to_save, path) - else: - raise TypeError("Only @serializable class can be serialized") - - class Loader: @deprecation.deprecated( diff --git a/sonosco/models/modules.py b/sonosco/models/modules.py new file mode 100644 index 0000000..015dc5a --- /dev/null +++ b/sonosco/models/modules.py @@ -0,0 +1,101 @@ +import torch +import logging +import torch.nn as nn +import torch.nn.functional as functional + + +LOGGER = logging.getLogger(__name__) + +supported_rnns = { + 'lstm': nn.LSTM, + 'rnn': nn.RNN, + 'gru': nn.GRU +} + +supported_rnns_inv = dict((v, k) for k, v in supported_rnns.items()) + + +class SequenceWise(nn.Module): + def __init__(self, module): + """ + Collapses input of dim T*N*H to (T*N)*H, and applies to a module. + Allows handling of variable sequence lengths and minibatch sizes. + :param module: Module to apply input to. + """ + super(SequenceWise, self).__init__() + self.module = module + + def forward(self, x): + t, n = x.size(0), x.size(1) + x = x.view(t * n, -1) + x = self.module(x) + x = x.view(t, n, -1) + return x + + def __repr__(self): + tmpstr = self.__class__.__name__ + ' (\n' + tmpstr += self.module.__repr__() + tmpstr += ')' + return tmpstr + + +class MaskConv(nn.Module): + def __init__(self, seq_module): + """ + Adds padding to the output of the module based on the given lengths. This is to ensure that the + results of the model do not change when batch sizes change during inference. + Input needs to be in the shape of (BxCxDxT) + :param seq_module: The sequential module containing the conv stack. 
+ """ + super(MaskConv, self).__init__() + self.seq_module = seq_module + + def forward(self, x, lengths): + """ + :param x: The input of size BxCxDxT + :param lengths: The actual length of each sequence in the batch + :return: Masked output from the module + """ + for module in self.seq_module: + x = module(x) + mask = torch.ByteTensor(x.size()).fill_(0) + if x.is_cuda: + mask = mask.cuda() + for i, length in enumerate(lengths): + length = length.item() + if (mask[i].size(2) - length) > 0: + mask[i].narrow(2, length, mask[i].size(2) - length).fill_(1) + x = x.masked_fill(mask, 0) + return x, lengths + + +class InferenceBatchSoftmax(nn.Module): + def forward(self, input_): + if not self.training: + return functional.softmax(input_, dim=-1) + else: + return input_ + + +class BatchRNN(nn.Module): + def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, batch_norm=True, bidirectional=False): + super(BatchRNN, self).__init__() + self.bidirectional = bidirectional + self.input_size = input_size + self.hidden_size = hidden_size + self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None + self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size, + bidirectional=bidirectional, bias=True) + + def flatten_parameters(self): + self.rnn.flatten_parameters() + + def forward(self, x, output_lengths): + if self.batch_norm is not None: + x = self.batch_norm(x) + x = nn.utils.rnn.pack_padded_sequence(x, output_lengths) + x, h = self.rnn(x) + x, _ = nn.utils.rnn.pad_packed_sequence(x) + if self.bidirectional: + x = x.view(x.size(0), x.size(1), 2, -1).sum(2).view(x.size(0), x.size(1), -1) # (TxNxH*2) -> (TxNxH) by sum + return x diff --git a/sonosco/models/saver.py b/sonosco/models/saver.py new file mode 100644 index 0000000..4198a2c --- /dev/null +++ b/sonosco/models/saver.py @@ -0,0 +1,56 @@ +import logging +import torch +import deprecation +import torch.nn as nn + +from .serialization import is_serializable + +LOGGER = logging.getLogger(__name__) + + +class Saver: + + def __init__(self) -> None: + super().__init__() + + @deprecation.deprecated( + details="This type of saving may cause problems when path of model class changes. Pleas use save_model instead") + def save_model_simple(self, model: nn.Module, path: str) -> None: + """ + Simply saves the model using pickle protocol. + Args: + model (nn.Module): model to save + path (str) : path where to save the model + + Returns: + + """ + torch.save(model, path) + + def save_model(self, model: nn.Module, path: str) -> None: + """ + Saves the model using pickle protocol. + + If the infer_structure is True this method infers all the meta parameters of the model and save them together + with learnable parameters. + + If the infer_structure is False and method specified by serialize_method_name exists, the return value of the + serialize_method_name method is saved. + + If neither of above only learnable parameters a.k.a. state_dict are saved. + + Args: + model (nn.Module): model to save + path (str) : path where to save the model + infer_structure (bool): indicator whether to infer the model structure + serialize_method_name (str): name of the function that this method should call in order to serialize the + model. Must return dict. 
diff --git a/sonosco/models/saver.py b/sonosco/models/saver.py
new file mode 100644
index 0000000..4198a2c
--- /dev/null
+++ b/sonosco/models/saver.py
@@ -0,0 +1,56 @@
+import logging
+import torch
+import deprecation
+import torch.nn as nn
+
+from .serialization import is_serializable
+
+LOGGER = logging.getLogger(__name__)
+
+
+class Saver:
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    @deprecation.deprecated(
+        details="This type of saving may cause problems when the path of the model class changes. Please use save_model instead")
+    def save_model_simple(self, model: nn.Module, path: str) -> None:
+        """
+        Simply saves the model using pickle protocol.
+        Args:
+            model (nn.Module): model to save
+            path (str): path where to save the model
+
+        Returns:
+
+        """
+        torch.save(model, path)
+
+    def save_model(self, model: nn.Module, path: str) -> None:
+        """
+        Saves the model using pickle protocol.
+
+        The model class must be decorated with @serializable. Its __serialize__ method is called
+        to collect the meta parameters together with the learnable parameters, and the resulting
+        dict is saved to the given path. A TypeError is raised for models that are not
+        @serializable.
+
+        Args:
+            model (nn.Module): model to save
+            path (str): path where to save the model
+
+        Returns:
+
+        """
+        if is_serializable(model):
+            entity_to_save = model.__serialize__()
+            torch.save(entity_to_save, path)
+        else:
+            raise TypeError("Only @serializable class can be serialized")

diff --git a/sonosco/serialization.py b/sonosco/models/serialization.py
similarity index 100%
rename from sonosco/serialization.py
rename to sonosco/models/serialization.py

From eee49f6b53fb785c0ec189eb56289b9e2c71356d Mon Sep 17 00:00:00 2001
From: yuriyarabskyy
Date: Tue, 25 Jun 2019 14:20:19 +0200
Subject: [PATCH 58/58] use f strings

---
 sonosco/models/serialization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sonosco/models/serialization.py b/sonosco/models/serialization.py
index 9f905b6..4f824af 100644
--- a/sonosco/models/serialization.py
+++ b/sonosco/models/serialization.py
@@ -49,7 +49,7 @@ def __create_serialize_body(cls, fields_to_serialize):
         elif is_dataclass(field.type):
             body_lines.append(__create_dict_entry(field.name, f"self.{field.name}.__serialize__()"))
         elif __is_nn_class(field.type):
-            body_lines.append("'{}': {".format(field.name))
+            body_lines.append(f"'{field.name}': {{")
             __extract_from_nn(cls, body_lines)
             body_lines.append("}")
         else:
@@ -75,7 +75,7 @@ def __throw_unsupported_data_type():
 
 
 def __create_dict_entry(key, value):
-    return f'\'{key}\': {value},'
+    return f"'{key}': {value},"
 
 
 def __is_primitive(obj):