-
Notifications
You must be signed in to change notification settings - Fork 0
/
lstm_speech_data.py
69 lines (49 loc) · 2.25 KB
/
lstm_speech_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import numpy as np
import os, librosa, math, json
audio_silence_treshold = 40
def dense_to_one_hot(labels_dense, num_classes=8):
"""Convert class labels from scalars to one-hot vectors."""
return np.eye(num_classes)[labels_dense]
def load_audio_data(file_path, mfcc_features=8, height=200):
json_file_path = file_path + str(mfcc_features) + '-' + str(height) + str(audio_silence_treshold) + '.json'
if os.path.exists(json_file_path):
with open(json_file_path, 'r') as file:
json_data = file.read()
return json.loads(json_data)
# 16000 Hz = VoIP
wave, sr = librosa.load(file_path, mono=True, sr=16000)
wave_frag_offsets = librosa.effects.split(wave, top_db=audio_silence_treshold)
results = []
for offsets in wave_frag_offsets:
wave_fragment = wave[offsets[0]:offsets[1]]
mfcc = librosa.feature.mfcc(wave_fragment, sr, n_mfcc=mfcc_features)
_, y_size = mfcc.shape
splitted_mfcc = np.array_split(mfcc, math.ceil(y_size / height), axis=1)
for short_mfcc in splitted_mfcc:
short_mfcc = np.pad(short_mfcc, ((0, 0), (0, height - len(short_mfcc[0]))), mode='constant', constant_values=0)
results.append(np.array(short_mfcc).tolist())
json_dump = json.dumps(results, separators=(',', ':'))
with open(json_file_path, 'w') as file:
file.write(json_dump)
return results
def load_dataset(dir_path):
dataset = []
for p in os.listdir(dir_path):
dataset.append(os.path.join(dir_path, p))
return dataset
def mfcc_get_batch(files, mfcc_features=8, height=200):
batch_features = []
labels = []
for wav in files:
if not wav.endswith(".wav"):
continue
# Our data is labeled 01-... but labels should be an int starting at 0
emotion = int(os.path.basename(wav).split('-')[0]) - 1
label = dense_to_one_hot(emotion, num_classes=8)
audio_data = load_audio_data(wav, mfcc_features=mfcc_features, height=height)
for mfcc in audio_data:
labels.append(label)
batch_features.append(mfcc)
if len(batch_features) % 100 == 0:
print('Loading Data Progress:', len(batch_features), '...')
return batch_features, labels