-
Notifications
You must be signed in to change notification settings - Fork 0
/
wav2vec_manifest.py
56 lines (44 loc) · 2.12 KB
/
wav2vec_manifest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Data pre-processing: build vocabularies and binarize training data.
"""
import argparse
import glob
import os
import soundfile
import random
def get_parser():
parser = argparse.ArgumentParser()
parser.add_argument('root', metavar='DIR', help='root directory containing flac files to index')
parser.add_argument('--valid-percent', default=0.01, type=float, metavar='D',
help='percentage of data to use as validation set (between 0 and 1)')
parser.add_argument('--dest', default='.', type=str, metavar='DIR', help='output directory')
parser.add_argument('--ext', default='wav', type=str, metavar='EXT', help='extension to look for')
parser.add_argument('--seed', default=42, type=int, metavar='N', help='random seed')
parser.add_argument('--path-must-contain', default=None, type=str, metavar='FRAG',
help='if set, path must contain this substring for a file to be included in the manifest')
return parser
def main(args):
assert args.valid_percent >= 0 and args.valid_percent <= 1.
dir_path = os.path.realpath(args.root)
search_path = os.path.join(dir_path, '**/*.' + args.ext)
rand = random.Random(args.seed)
with open(os.path.join(args.dest, 'train.tsv'), 'w') as train_f, open(
os.path.join(args.dest, 'valid.tsv'), 'w') as valid_f:
print(dir_path, file=train_f)
print(dir_path, file=valid_f)
for fname in glob.iglob(search_path, recursive=True):
file_path = os.path.realpath(fname)
if args.path_must_contain and args.path_must_contain not in file_path:
continue
frames = soundfile.info(fname).frames
dest = train_f if rand.random() > args.valid_percent else valid_f
print('{}\t{}'.format(os.path.relpath(file_path, dir_path), frames), file=dest)
if __name__ == '__main__':
parser = get_parser()
args = parser.parse_args()
main(args)