forked from Kyubyong/tacotron
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
132 lines (104 loc) · 3.9 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# -*- coding: utf-8 -*-
#/usr/bin/python2
'''
By kyubyong park. [email protected].
https://www.github.com/kyubyong/tacotron
'''
from __future__ import print_function
import copy
import librosa
from hyperparams import Hyperparams as hp
import numpy as np
import tensorflow as tf
def get_spectrograms(sound_file):
    '''Extracts a mel spectrogram and a linear magnitude spectrogram from `sound_file`.

    Neither output is log-scaled; callers that need log values must apply
    the log themselves.

    Args:
      sound_file: A string. Full path of a sound file.

    Returns:
      Transposed S: A 2d float32 array. Mel-scaled power spectrogram with
        shape of (T, hp.n_mels).
      Transposed magnitude: A 2d float32 array. Linear magnitude spectrogram
        with shape of (T, 1+hp.n_fft//2).
    '''
    # Load the sound file at the sampling rate requested by hp.sr.
    y, sr = librosa.load(sound_file, sr=hp.sr)

    # stft. D: (1+n_fft//2, T)
    D = librosa.stft(y=y,
                     n_fft=hp.n_fft,
                     hop_length=hp.hop_length,
                     win_length=hp.win_length)

    # magnitude spectrogram: (1+n_fft//2, T)
    magnitude = np.abs(D)

    # power spectrogram: (1+n_fft//2, T)
    power = magnitude ** 2

    # mel spectrogram: (n_mels, T).
    # `sr` is passed explicitly so that the mel filterbank is built for the
    # rate the audio was actually loaded at; otherwise librosa falls back to
    # its own default sampling rate, which is only correct when it happens
    # to match hp.sr.
    S = librosa.feature.melspectrogram(S=power, sr=sr, n_mels=hp.n_mels)

    # Time-major outputs: (T, n_mels), (T, 1+n_fft//2)
    return np.transpose(S.astype(np.float32)), np.transpose(magnitude.astype(np.float32))
def shift_by_one(inputs):
    '''Builds decoder inputs by delaying `inputs` one step along the time axis.

    A zero frame is placed at the first time step and the last frame of
    `inputs` is dropped.

    Args:
      inputs: A 3d tensor with shape of [N, T, C]

    Returns:
      A 3d tensor with the same shape and dtype as `inputs`.
    '''
    # Zero frame that takes the place of the first time step.
    leading_zeros = tf.zeros_like(inputs[:, :1, :])
    # Every frame except the last one, to be moved one step later.
    all_but_last = inputs[:, :-1, :]
    return tf.concat((leading_zeros, all_but_last), 1)
def reduce_frames(arry, step, r):
    '''Reduces and adjusts the shape and content of `arry` according to r.

    `arry` is zero-padded at the end so that its length is a multiple of
    `step*r`, cut into chunks of `step` frames, and every run of `r`
    consecutive chunks is laid side by side along the channel axis.

    Args:
      arry: A 2d array with shape of [T, C]
      step: An int. Overlapping span (number of frames per chunk).
      r: Reduction factor

    Returns:
      A 2d array with shape of [-1, C*r]. An empty input (T == 0) yields an
      array with shape of [0, C*r].
    '''
    T, C = arry.shape
    span = step * r

    # Number of zero frames needed to reach the next multiple of `span`
    # (0 when T is already a multiple).
    num_padding = -T % span
    arry = np.pad(arry, [[0, num_padding], [0, 0]], 'constant', constant_values=(0, 0))

    # View the padded frames as [group, chunk-in-group, frame-in-chunk, C],
    # then bring the chunk axis next to the channel axis so that merging the
    # last two axes puts the r chunks of a group side by side. This is a
    # single copy instead of re-stacking the growing result once per group.
    num_groups = arry.shape[0] // span
    grouped = arry.reshape(num_groups, r, step, C)
    return grouped.transpose(0, 2, 1, 3).reshape(num_groups * step, C * r)
def spectrogram2wav(spectrogram):
    '''Converts a magnitude spectrogram into a waveform by iterative phase estimation.

    Starting from the magnitudes alone, each of the hp.n_iter iterations
    inverts the current estimate to a signal, re-analyses that signal with
    an STFT, keeps only the resulting phase, and recombines it with the
    given magnitudes (Griffin-Lim style reconstruction).

    Args:
      spectrogram: A 2d array with shape of [t, f], i.e. [t, nfft // 2 + 1]

    Returns:
      The real part of the reconstructed signal.
    '''
    spectrogram = spectrogram.T  # [f, t]
    X_best = copy.deepcopy(spectrogram)  # [f, t]
    for _ in range(hp.n_iter):
        X_t = invert_spectrogram(X_best)
        # n_fft / hop_length are passed by keyword: recent librosa releases
        # no longer accept them positionally.
        est = librosa.stft(X_t,
                           n_fft=hp.n_fft,
                           hop_length=hp.hop_length,
                           win_length=hp.win_length)  # [f, t]
        # Unit-magnitude phase; the floor avoids division by zero.
        phase = est / np.maximum(1e-8, np.abs(est))  # [f, t]
        X_best = spectrogram * phase  # [f, t]
    X_t = invert_spectrogram(X_best)
    return np.real(X_t)
def invert_spectrogram(spectrogram):
    '''Inverts an STFT matrix back to a time-domain signal.

    Args:
      spectrogram: A 2d array with shape of [f, t]

    Returns:
      The signal produced by librosa.istft with hp.hop_length,
      hp.win_length and a "hann" window.
    '''
    # hop_length is passed by keyword: recent librosa releases no longer
    # accept it positionally.
    return librosa.istft(spectrogram,
                         hop_length=hp.hop_length,
                         win_length=hp.win_length,
                         window="hann")
def restore_shape(arry, step, r):
    '''Unfolds frames that were laid side by side and trims trailing zero padding.

    `arry` is cut into chunks of `step` rows; each chunk is split into `r`
    equal column blocks, and the blocks are stacked one below another.
    All-zero rows at the end of the result are then removed.

    Args:
      arry: A 2d array with shape of [T, C]. C must be divisible by r.
      step: An int. Overlapping span (number of rows per chunk).
      r: Reduction factor

    Returns:
      A 2d array with shape of [-1, C//r]
    '''
    T, C = arry.shape
    chunks = np.split(arry, list(range(step, T, step)), axis=0)

    # Gather every column block first and stack once, instead of re-stacking
    # the growing result for each chunk.
    blocks = [block for chunk in chunks for block in np.split(chunk, r, axis=1)]
    restored = np.vstack(blocks)

    # Trim zero paddings: drop only the all-zero rows at the very end.
    # Counting rows with a non-zero sum would also cut real trailing frames
    # whenever an earlier row is silent or its values cancel out.
    nonzero_rows = np.flatnonzero(np.any(restored != 0, axis=1))
    end = nonzero_rows[-1] + 1 if nonzero_rows.size else 0
    return restored[:end]