Skip to content

Commit

Permalink
Add pad_audio and convert_to_sequences functions in FeatureExtractor
Browse files Browse the repository at this point in the history
  • Loading branch information
pzinemanas committed Mar 5, 2021
1 parent e72574a commit 5786ab6
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 52 deletions.
34 changes: 34 additions & 0 deletions dcase_models/data/feature_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import librosa
import soundfile as sf
import json
from scipy.stats import kurtosis, skew

from ..util.files import load_json, mkdir_if_not_exists
from ..util.files import duplicate_folder_structure
Expand Down Expand Up @@ -336,3 +337,36 @@ def get_features_path(self, dataset):
dataset.dataset_path, self.features_folder, feature_name
)
return features_path

def pad_audio(self, audio):
    """Pad (or crop) an audio signal before feature extraction.

    The signal is returned untouched unless ``self.sequence_time > 0``
    and ``self.pad_mode`` is not None. Otherwise, with
    ``sequence_samples`` being the number of samples spanned by
    ``self.sequence_frames`` analysis frames:

    * if ``self.sequence_hop_time > 0``, the signal is extended by
      ``sequence_samples`` samples using ``self.pad_mode``;
    * else the signal is forced to exactly ``sequence_samples`` samples,
      padding with ``self.pad_mode`` when it is shorter and cropping
      when it is longer.

    Parameters
    ----------
    audio : ndarray
        Audio signal. Padding/cropping is applied along axis 0.

    Returns
    -------
    ndarray
        The padded, cropped or unchanged audio signal.

    """
    if not (self.sequence_time > 0 and self.pad_mode is not None):
        return audio

    # Keyword arguments are used for the librosa calls because recent
    # librosa releases made these parameters keyword-only; the keyword
    # form is accepted by older releases as well.
    sequence_samples = librosa.core.frames_to_samples(
        self.sequence_frames, hop_length=self.audio_hop, n_fft=self.n_fft)

    if self.sequence_hop_time > 0:
        return librosa.util.fix_length(
            audio,
            size=audio.shape[0] + sequence_samples,
            axis=0, mode=self.pad_mode
        )

    # A single sequence is expected: force the exact length.
    if len(audio) < sequence_samples:
        return librosa.util.fix_length(
            audio, size=sequence_samples, axis=0, mode=self.pad_mode)
    return audio[:sequence_samples]

def convert_to_sequences(self, audio_representation):
    """Split a frame-level representation into sequences.

    When both ``self.sequence_time`` and ``self.sequence_hop_time`` are
    positive, the input is sliced along axis 0 into windows of
    ``self.sequence_frames`` frames taken every ``self.sequence_hop``
    frames. Otherwise the whole input is treated as a single sequence
    and only a leading axis of length 1 is added.

    Parameters
    ----------
    audio_representation : ndarray
        Features whose first axis indexes frames
        (presumably shape (N_frames, ...) -- confirm against callers).

    Returns
    -------
    ndarray
        The framed representation, or the input with a new leading
        axis of size 1.

    """
    if self.sequence_time > 0 and self.sequence_hop_time > 0:
        # frame_length/hop_length are passed by keyword because recent
        # librosa releases made them keyword-only; older releases accept
        # the keyword form too.
        return librosa.util.frame(
            np.ascontiguousarray(audio_representation),
            frame_length=self.sequence_frames,
            hop_length=self.sequence_hop,
            axis=0
        )

    return np.expand_dims(audio_representation, axis=0)
83 changes: 31 additions & 52 deletions dcase_models/data/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@
from ..model.models import VGGish


__all__ = ['Spectrogram', 'MelSpectrogram',
'Openl3', 'RawAudio', 'FramesAudio']
__all__ = ['Spectrogram', 'MelSpectrogram', 'MFCC',
'Openl3', 'RawAudio', 'FramesAudio',
'VGGishEmbeddings']


class Spectrogram(FeatureExtractor):
Expand Down Expand Up @@ -78,13 +79,7 @@ def calculate(self, file_name):
audio = self.load_audio(file_name)

# Padding
if self.pad_mode is not None:
audio = librosa.util.fix_length(
audio,
audio.shape[0] + librosa.core.frames_to_samples(
self.sequence_frames, self.audio_hop, n_fft=self.n_fft),
axis=0, mode=self.pad_mode
)
audio = self.pad_audio(audio)

# Spectrogram, shape (N_frames, N_freqs)
stft = librosa.core.stft(audio, n_fft=self.n_fft,
Expand All @@ -102,10 +97,11 @@ def calculate(self, file_name):

# Convert to sequences (frames),
# shape (N_sequences, N_sequence_frames, N_freqs)
spectrogram = np.ascontiguousarray(spectrogram)
spectrogram = librosa.util.frame(
spectrogram, self.sequence_frames, self.sequence_hop, axis=0
)
# spectrogram = np.ascontiguousarray(spectrogram)
# spectrogram = librosa.util.frame(
# spectrogram, self.sequence_frames, self.sequence_hop, axis=0
# )
spectrogram = self.convert_to_sequences(spectrogram)

return spectrogram

Expand Down Expand Up @@ -195,14 +191,8 @@ def calculate(self, file_name):
# return None

# Pad audio signal
if self.pad_mode is not None:
audio = librosa.util.fix_length(
audio,
audio.shape[0] + librosa.core.frames_to_samples(
self.sequence_frames, self.audio_hop, n_fft=self.n_fft),
axis=0, mode=self.pad_mode
)

audio = self.pad_audio(audio)

# Get the spectrogram, shape (N_freqs, N_frames)
stft = librosa.core.stft(audio, n_fft=self.n_fft,
hop_length=self.audio_hop,
Expand All @@ -229,10 +219,7 @@ def calculate(self, file_name):

# Convert to sequences (frames),
# shape (N_sequences, N_sequence_frames, N_bands)
mel_spectrogram = np.ascontiguousarray(mel_spectrogram)
mel_spectrogram = librosa.util.frame(
mel_spectrogram, self.sequence_frames, self.sequence_hop, axis=0
)
mel_spectrogram = self.convert_to_sequences(mel_spectrogram)

return mel_spectrogram

Expand Down Expand Up @@ -330,13 +317,7 @@ def calculate(self, file_name):
# return None

# Pad audio signal
if self.pad_mode is not None:
audio = librosa.util.fix_length(
audio,
audio.shape[0] + librosa.core.frames_to_samples(
self.sequence_frames, self.audio_hop, n_fft=self.n_fft),
axis=0, mode=self.pad_mode
)
audio = self.pad_audio(audio)

# Get the spectrogram, shape (N_freqs, N_frames)
stft = librosa.core.stft(audio, n_fft=self.n_fft,
Expand Down Expand Up @@ -365,10 +346,11 @@ def calculate(self, file_name):

# Convert to sequences (frames),
# shape (N_sequences, N_sequence_frames, N_MFCC)
mfcc = np.ascontiguousarray(mfcc)
mfcc = librosa.util.frame(
mfcc, self.sequence_frames, self.sequence_hop, axis=0
)
# mfcc = np.ascontiguousarray(mfcc)
# mfcc = librosa.util.frame(
# mfcc, self.sequence_frames, self.sequence_hop, axis=0
# )
mfcc = self.convert_to_sequences(mfcc)

return mfcc

Expand Down Expand Up @@ -438,14 +420,14 @@ def __init__(self, sequence_time=1.0, sequence_hop_time=0.5,
self.content_type = content_type
self.input_repr = input_repr
self.embedding_size = embedding_size
self.openl3 = openl3.models.load_audio_embedding_model(
input_repr, content_type, embedding_size)

def calculate(self, file_name):
audio = self.load_audio(file_name, change_sampling_rate=False)
emb, ts = openl3.get_audio_embedding(
audio, self.sr,
content_type=self.content_type,
embedding_size=self.embedding_size,
input_repr=self.input_repr,
model=self.openl3,
hop_size=self.sequence_hop_time,
verbose=False
)
Expand Down Expand Up @@ -494,7 +476,7 @@ def calculate(self, file_name):

audio = np.ascontiguousarray(audio)
audio_seqs = librosa.util.frame(
audio, self.sequence_samples, self.sequence_hop_samples, axis=0
audio, self.sequence_samples, self.sequence_hop_samples, axis=0
)

return audio_seqs
Expand Down Expand Up @@ -534,23 +516,20 @@ def __init__(self, sequence_time=1.0, sequence_hop_time=0.5,
def calculate(self, file_name):
audio = self.load_audio(file_name, change_sampling_rate=False)

if self.pad_mode is not None:
audio = librosa.util.fix_length(
audio,
audio.shape[0] + self.sequence_samples,
axis=0, mode=self.pad_mode
)
audio = self.pad_audio(audio)

audio = np.ascontiguousarray(audio)
audio_frames = librosa.util.frame(
audio, self.audio_win, self.audio_hop, axis=0
)
# TODO: ADD WINDOWING

audio_frames = np.ascontiguousarray(audio_frames)
audio_seqs = librosa.util.frame(
audio_frames, self.sequence_frames, self.sequence_hop, axis=0
)
# audio_frames = np.ascontiguousarray(audio_frames)
# audio_seqs = librosa.util.frame(
# audio_frames, self.sequence_frames, self.sequence_hop, axis=0
# )

audio_seqs = self.convert_to_sequences(audio_frames)

return audio_seqs

Expand All @@ -569,7 +548,7 @@ class VGGishEmbeddings(FeatureExtractor):
"""
def __init__(self, sequence_hop_time=0.96,
pad_mode='reflect'):
pad_mode='reflect', include_top=True, compress=True):

sequence_time = 0.96
audio_win = 400
Expand All @@ -596,7 +575,7 @@ def __init__(self, sequence_hop_time=0.96,
self.vggish = VGGish(
model=None, model_path=None, metrics=[],
n_frames_cnn=96, n_freq_cnn=64, n_classes=0,
embedding_size=128, pooling='avg', include_top=True, compress=True)
embedding_size=128, pooling='avg', include_top=include_top, compress=compress)

self.vggish.load_pretrained_model_weights()

Expand Down

0 comments on commit 5786ab6

Please sign in to comment.