# json_meta.py (forked from r9y9/deepvoice3_pytorch)
'''
Started in 1945h, Mar 10, 2018
First done in 2103h, Mar 11, 2018
Test done in 2324h, Mar 11, 2018
Modified for HTK labeling in 1426h, Apr 21, 2018
by engiecat(github)

This makes r9y9/deepvoice3_pytorch compatible with the json format of
carpedm20/multi-speaker-tacotron-tensorflow and keithito/tacotron.

One json file is given per speaker, with entries in the format
(if completely aligned)
    (path-to-the-audio): aligned text
(if partially aligned)
    (path-to-the-audio): [candidate sentence - not aligned, recognized words]
(if non-aligned)
    (path-to-the-audio): [recognized words]

(e.g. python preprocess.py json_meta "./datasets/LJSpeech_1_0/alignment.json,./datasets/GoTBookRev/alignment.json" "./datasets/LJ+GoTBookRev" --preset=./presets/deepvoice3_vctk.json)

usage:
    python preprocess.py [option] <json_paths> <output_data_path>

options:
    --preset     Path of preset parameters (json).
    -h --help    Show this help message and exit.
'''
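# Illustrative alignment.json entries matching the three cases described above
# (paths and sentences are hypothetical, for illustration only):
#
#     {
#         "audio/0001.wav": "fully aligned transcript of this clip",
#         "audio/0002.wav": ["candidate sentence - not aligned", "recognized words"],
#         "audio/0003.wav": ["recognized words"]
#     }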
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import numpy as np
import os
import audio
from nnmnkwii.io import hts
from hparams import hparams
from os.path import exists
import librosa
import json

def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []

    json_paths = in_dir.split(',')
    json_paths = [json_path.replace("'", "").replace('"', "") for json_path in json_paths]
    num_speakers = len(json_paths)
    is_aligned = {}

    speaker_id = 0
    for json_path in json_paths:
        # Load json (or '|'-separated csv) metadata
        if json_path.endswith("json"):
            with open(json_path, encoding='utf8') as f:
                content = f.read()
            info = json.loads(content)
        elif json_path.endswith("csv"):
            with open(json_path) as f:
                info = {}
                for line in f:
                    path, text = line.strip().split('|')
                    info[path] = text
        else:
            raise Exception(" [!] Unknown metadata format: {}".format(json_path))

        print(" [*] Loaded - {}".format(json_path))

        # Check audio file existence; paths may be given relative to the metadata file
        base_dir = os.path.dirname(json_path)
        new_info = {}
        for path in info.keys():
            if not os.path.exists(path):
                new_path = os.path.join(base_dir, path)
                if not os.path.exists(new_path):
                    print(" [!] Audio not found: {}".format([path, new_path]))
                    continue
            else:
                new_path = path
            new_info[new_path] = info[path]
        info = new_info

        # ignore_recognition_level check
        for path in info.keys():
            is_aligned[path] = True
            if isinstance(info[path], list):
                if hparams.ignore_recognition_level == 1 and len(info[path]) == 1 or \
                        hparams.ignore_recognition_level == 2:
                    # flag the path as carrying 'non-aligned' text
                    is_aligned[path] = False
                    info[path] = info[path][0]

        # Queue the utterances for processing
        queue_count = 0
        for audio_path, text in info.items():
            if isinstance(text, list):
                if hparams.ignore_recognition_level == 0:
                    text = text[-1]
                else:
                    text = text[0]
            if hparams.ignore_recognition_level > 0 and not is_aligned[audio_path]:
                continue
            if hparams.min_text > len(text):
                continue
            if num_speakers == 1:
                # Single-speaker
                futures.append(executor.submit(
                    partial(_process_utterance_single, out_dir, text, audio_path)))
            else:
                # Multi-speaker
                futures.append(executor.submit(
                    partial(_process_utterance, out_dir, text, audio_path, speaker_id)))
            queue_count += 1
        print(" [*] Appended {} entries in the queue".format(queue_count))

        # increase speaker_id
        speaker_id += 1

    # Show ignore_recognition_level description
    ignore_description = {
        0: "use all",
        1: "ignore only unmatched_alignment",
        2: "fully ignore recognition",
    }
    print(" [!] Skip recognition level: {} ({})".format(
        hparams.ignore_recognition_level,
        ignore_description[hparams.ignore_recognition_level]))

    if num_speakers == 1:
        print(" [!] Single-speaker mode activated!")
    else:
        print(" [!] Multi-speaker({}) mode activated!".format(num_speakers))

    # Now, do the job!
    results = [future.result() for future in tqdm(futures)]
    # Drop None entries (utterances filtered out because of bad or missing HTK
    # alignment when process_only_htk_aligned is enabled in hparams)
    results = [result for result in results if result is not None]
    return results
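
# Sketch of how a caller might consume the list returned by build_from_path
# (hypothetical; the actual preprocess.py driver may write its metadata
# differently): one '|'-separated line per utterance, e.g.
#
#     with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
#         for metadata in results:
#             f.write('|'.join([str(x) for x in metadata]) + '\n')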

def start_at(labels):
    has_silence = labels[0][-1] == "pau"
    if not has_silence:
        return labels[0][0]
    for i in range(1, len(labels)):
        if labels[i][-1] != "pau":
            return labels[i][0]
    assert False


def end_at(labels):
    has_silence = labels[-1][-1] == "pau"
    if not has_silence:
        return labels[-1][1]
    for i in range(len(labels) - 2, 0, -1):
        if labels[i][-1] != "pau":
            return labels[i][1]
    assert False
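
# start_at/end_at return HTK-style label times, which are expressed in units of
# 100 nanoseconds; the `* 1e-7 * sr` conversion below turns them into sample
# indices. Worked example with hypothetical numbers: a start time of 2500000
# (i.e. 0.25 s) at sr = 22050 gives int(2500000 * 1e-7 * 22050) = 5512.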

def _process_utterance(out_dir, text, wav_path, speaker_id=None):
    # Fall back to single-speaker mode when no speaker_id is given
    if speaker_id is None:
        return _process_utterance_single(out_dir, text, wav_path)

    # modified version of the VCTK _process_utterance
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    if not exists(lab_path):
        lab_path = os.path.splitext(wav_path)[0] + '.lab'

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        if hparams.process_only_htk_aligned:
            return None
        wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    # Get the filename from wav_path
    wav_name = os.path.basename(wav_path)
    wav_name = os.path.splitext(wav_name)[0]

    # Prefix with speaker_id in case wav files of different speakers share the
    # same naming format, e.g. Recording0.wav
    spectrogram_filename = 'spec-{}-{}.npy'.format(speaker_id, wav_name)
    mel_filename = 'mel-{}-{}.npy'.format(speaker_id, wav_name)
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)
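
# Note on the saved arrays (here and in _process_utterance_single below): the
# spectrograms are written transposed, so each .npy file is time-major, i.e.
# roughly (n_frames, num_freq_bins) for 'spec-*' and (n_frames, num_mels) for
# 'mel-*', assuming audio.spectrogram/audio.melspectrogram return
# (bins, frames) arrays, as n_frames = spectrogram.shape[1] suggests.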

def _process_utterance_single(out_dir, text, wav_path):
    # modified version of LJSpeech _process_utterance

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    sr = hparams.sample_rate

    # Added from the multispeaker version
    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    if not exists(lab_path):
        lab_path = os.path.splitext(wav_path)[0] + '.lab'

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        if hparams.process_only_htk_aligned:
            return None
        wav, _ = librosa.effects.trim(wav, top_db=15)
    # End added from the multispeaker version

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    # Get filename from wav_path
    wav_name = os.path.basename(wav_path)
    wav_name = os.path.splitext(wav_name)[0]

    spectrogram_filename = 'spec-{}.npy'.format(wav_name)
    mel_filename = 'mel-{}.npy'.format(wav_name)
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
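
# Illustrative sanity check after preprocessing (hypothetical filename built
# from the naming scheme above: speaker_id 0 and a file named Recording0.wav):
#
#     mel = np.load(os.path.join(out_dir, 'mel-0-Recording0.npy'))
#     print(mel.shape)  # expected to be (n_frames, num_mels)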