-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01_download_word_recordings.py
executable file
·101 lines (87 loc) · 3.21 KB
/
01_download_word_recordings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env python
import argparse
import requests
import json
import codecs
from requests.exceptions import ConnectionError
from os.path import exists, isdir, join
def download_file(url, local_filename, chunk_size=1024, timeout=None):
    """Stream the resource at ``url`` into the file ``local_filename``.

    Adapted from
    https://stackoverflow.com/questions/16694907/how-to-download-large-file-in-python-with-requests-py/16696317#16696317

    Args:
        url: Address of the resource to fetch.
        local_filename: Path of the file that receives the response body.
            It is opened in ``'wb'`` mode, so an existing file is replaced.
        chunk_size: Number of bytes requested from the stream per iteration.
        timeout: Forwarded to ``requests.get``. The default ``None`` keeps
            the previous behaviour of waiting without a limit.

    Returns:
        ``local_filename``, unchanged.

    Raises:
        ConnectionError: If the server answers with an error status
            (``response.ok`` is false). This file imports the name from
            ``requests.exceptions``, which is what the calling script
            already catches to skip a file it could not fetch.
    """
    # Local import: only needed to clean up after a failed transfer.
    import os

    # stream=True defers fetching the body, so it is written chunk by chunk
    # instead of being held in memory as a whole.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Check the status before touching the disk; otherwise the server's
        # error document would be stored under the audio file's name.
        if not response.ok:
            raise ConnectionError(
                "Unexpected HTTP status {} for {}".format(response.status_code, url)
            )

        finished = False
        try:
            with open(local_filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
            finished = True
        finally:
            # A truncated file would look like a completed download to any
            # caller that only checks whether the path exists.
            if not finished:
                try:
                    os.remove(local_filename)
                except OSError:
                    pass
    return local_filename
def build_parser():
    """Create the command-line parser for the downloader.

    Returns:
        An ``argparse.ArgumentParser`` exposing ``--from``, ``--to``,
        ``--url``, ``--s3_prefix``, ``--output_folder``, ``--output_file``
        and ``--skip_storage``.
    """
    # The text must be passed as ``description``; the first positional
    # argument of ArgumentParser is ``prog`` and would replace the program
    # name in the usage line.
    parser = argparse.ArgumentParser(
        description="Download files from Open Speech Corpus"
    )
    # type=int keeps values given on the command line consistent with the
    # integer defaults and rejects anything that is not a number.
    parser.add_argument('--from', type=int, default=1)
    parser.add_argument('--to', type=int, default=500)
    parser.add_argument(
        '--url',
        default='http://openspeechcorpus.contraslash.com/api/isolated-words/list/'
    )
    parser.add_argument(
        '--s3_prefix',
        default='https://s3.amazonaws.com/contraslash/openspeechcorpus/media/audio-data/v2/'
    )
    parser.add_argument('--output_folder', default='output_folder')
    parser.add_argument('--output_file', default='transcription.txt')
    parser.add_argument('--skip_storage', action='store_true')
    return parser


def build_query_url(base_url, start=None, end=None):
    """Append the ``from``/``to`` query parameters to ``base_url``.

    Args:
        base_url: Address of the listing endpoint.
        start: Value for the ``from`` parameter; omitted when ``None``.
        end: Value for the ``to`` parameter; omitted when ``None``.

    Returns:
        ``base_url`` followed by ``?`` and the ``&``-joined parameters, or
        ``base_url`` itself when both values are ``None``.
    """
    params = []
    if start is not None:
        params.append("from={}".format(start))
    if end is not None:
        params.append("to={}".format(end))
    if not params:
        return base_url
    # Joining avoids the dangling '&' left behind when only ``from`` is set.
    return "{}?{}".format(base_url, "&".join(params))


def main(argv=None):
    """Query the word listing, append the transcript and fetch the audio.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv``.

    Returns:
        Process exit status: 0 on success, 1 when the output folder does not
        exist, 2 when it is not a directory, 3 when the listing could not be
        retrieved.
    """
    args = vars(build_parser().parse_args(argv))
    output_folder = args['output_folder']

    # Validate the destination before spending a network round trip.
    if not exists(output_folder):
        print("Output folder does not exists")
        return 1
    if not isdir(output_folder):
        print("Output folder exists exists but is not a folder")
        return 2

    url = build_query_url(args['url'], args['from'], args['to'])
    print("Querying {}".format(url))
    try:
        # Without a timeout an unresponsive server blocks the script forever.
        response = requests.get(url, timeout=60)
    except requests.exceptions.RequestException as error:
        print("Cannot connect to server: {}".format(error))
        return 3
    if response.status_code != 200:
        print("Cannot connect to server, response status was {}".format(response.status_code))
        return 3

    json_data = json.loads(response.content.decode())
    print("We get {} audio datas".format(len(json_data)))

    # An explicit encoding keeps non-ASCII words independent of the locale;
    # the with-block closes the transcript even if an entry is malformed.
    with codecs.open(args['output_file'], 'a', encoding='utf-8') as output_file:
        for audio_data in json_data:
            audio_id = audio_data['audio']['id']
            file_name = "{}.mp4".format(join(output_folder, str(audio_id)))
            output_file.write(
                "{},{}\n".format(file_name, audio_data['isolated_word']['text'].strip())
            )
            if exists(file_name) or args.get("skip_storage", False):
                continue
            remote_url = "{}{}.mp4".format(args['s3_prefix'], audio_id)
            print("Download file with id: {}".format(audio_id))
            print(remote_url)
            try:
                download_file(remote_url, file_name)
            except requests.exceptions.RequestException:
                # Best effort: report the failure and carry on with the
                # remaining files. RequestException is the base class of the
                # ConnectionError that requests raises.
                print("Error getting file {}".format(file_name))
    return 0


if __name__ == '__main__':
    raise SystemExit(main())