forked from YerevaNN/Spoken-language-identification
-
Notifications
You must be signed in to change notification settings - Fork 0
/
augment_data.py
99 lines (80 loc) · 3.76 KB
/
augment_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import numpy as np
from matplotlib import pyplot as plt
import scipy.io.wavfile as wav
from numpy.lib import stride_tricks
import PIL.Image as Image
import os
""" short time fourier transform of audio signal """
def stft(sig, frameSize, overlapFac=0.5, window=np.hanning):
win = window(frameSize)
hopSize = int(frameSize - np.floor(overlapFac * frameSize))
# zeros at beginning (thus center of 1st window should be for sample nr. 0)
samples = np.append(np.zeros(np.floor(frameSize/2.0)), sig)
# cols for windowing
cols = np.ceil( (len(samples) - frameSize) / float(hopSize)) + 1
# zeros at end (thus samples can be fully covered by frames)
samples = np.append(samples, np.zeros(frameSize))
frames = stride_tricks.as_strided(samples, shape=(cols, frameSize), strides=(samples.strides[0]*hopSize, samples.strides[0])).copy()
frames *= win
return np.fft.rfft(frames)
""" scale frequency axis logarithmically """
def logscale_spec(spec, sr=44100, factor=20., alpha=1.0, f0=0.9, fmax=1):
spec = spec[:, 0:256]
timebins, freqbins = np.shape(spec)
scale = np.linspace(0, 1, freqbins) #** factor
# http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=650310&url=http%3A%2F%2Fieeexplore.ieee.org%2Fiel4%2F89%2F14168%2F00650310
scale = np.array(map(lambda x: x * alpha if x <= f0 else (fmax-alpha*f0)/(fmax-f0)*(x-f0)+alpha*f0, scale))
scale *= (freqbins-1)/max(scale)
newspec = np.complex128(np.zeros([timebins, freqbins]))
allfreqs = np.abs(np.fft.fftfreq(freqbins*2, 1./sr)[:freqbins+1])
freqs = [0.0 for i in range(freqbins)]
totw = [0.0 for i in range(freqbins)]
for i in range(0, freqbins):
if (i < 1 or i + 1 >= freqbins):
newspec[:, i] += spec[:, i]
freqs[i] += allfreqs[i]
totw[i] += 1.0
continue
else:
# scale[15] = 17.2
w_up = scale[i] - np.floor(scale[i])
w_down = 1 - w_up
j = int(np.floor(scale[i]))
newspec[:, j] += w_down * spec[:, i]
freqs[j] += w_down * allfreqs[i]
totw[j] += w_down
newspec[:, j + 1] += w_up * spec[:, i]
freqs[j + 1] += w_up * allfreqs[i]
totw[j + 1] += w_up
for i in range(len(freqs)):
if (totw[i] > 1e-6):
freqs[i] /= totw[i]
return newspec, freqs
""" plot spectrogram"""
def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="gray", channel=0, name='tmp.png', alpha=1, offset=0):
samplerate, samples = wav.read(audiopath)
samples = samples[:, channel]
s = stft(samples, binsize)
sshow, freq = logscale_spec(s, factor=1, sr=samplerate, alpha=alpha)
sshow = sshow[2:, :]
ims = 20.*np.log10(np.abs(sshow)/10e-6) # amplitude to decibel
timebins, freqbins = np.shape(ims)
ims = np.transpose(ims)
ims = ims[0:256, offset:offset+768] # 0-11khz, ~9s interval
#print "ims.shape", ims.shape
image = Image.fromarray(ims)
image = image.convert('L')
image.save(name)
file = open('trainingData.csv', 'r')
for iter, line in enumerate(file.readlines()[1:]): # first line of traininData.csv is header (only for trainingData.csv)
filepath = line.split(',')[0]
filename = filepath[:-4]
wavfile = 'tmp.wav'
os.system('mpg123 -w ' + wavfile + ' /home/brainstorm/caffe/Data/mnt/3/language/train/mp3/' + filepath)
for augmentIdx in range(0, 20):
alpha = np.random.uniform(0.9, 1.1)
offset = np.random.randint(90)
plotstft(wavfile, channel=0, name='/home/brainstorm/data/language/train/pngaugm/'+filename+'.'+str(augmentIdx)+'.png',
alpha=alpha, offset=offset)
os.remove(wavfile)
print "processed %d files" % (iter + 1)