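"""Cut the silent parts out of videos and speed them up to a target WPM.

For every video in res/convert/ the script:
  1. removes the silent parts with jumpcutter,
  2. extracts the audio and resamples it to 16 kHz mono,
  3. transcribes the audio with Mozilla DeepSpeech,
  4. estimates the resulting words per minute and speeds the video up with
     ffmpeg until it matches the WPM constant below,
  5. deletes the intermediate files.
"""
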
import audioop
import os
import sys
import wave
from pathlib import Path
from timeit import default_timer as timer

import numpy as np
from deepspeech import Model
from jumpcutter.clip import Clip
from moviepy.editor import *
from pydub import AudioSegment

# Pick the German model files when LANGUAGE is 'DE' (True indexes the second
# tuple element), otherwise the English DeepSpeech 0.9.3 release files.
LANGUAGE = 'DE'
MODEL_PATH = ('deepspeech-0.9.3-models.pbmm', 'output_graph.pbmm')[LANGUAGE == 'DE']
SCORER_PATH = ('deepspeech-0.9.3-models.scorer', 'kenlm.scorer')[LANGUAGE == 'DE']

# Target speaking rate (words per minute) for the sped-up output.
WPM = 200


def get_wav_from_video(filename):
    # Extract the audio track from the video and write it as a wav file.
    audioclip = AudioFileClip('res/' + filename + '.mp4')
    audioclip.write_audiofile('res/' + filename + '_audio.wav')

    audio_file = wave.open('res/' + filename + '_audio.wav', 'r')
    audio_data = audio_file.readframes(audio_file.getnframes())
    og_frames = audio_file.getnframes()
    og_rate = audio_file.getframerate()
    og_duration = og_frames / float(og_rate)

    # Resample to the 16 kHz rate DeepSpeech expects, assuming 16-bit stereo
    # source audio (the channel count is set via setparams below).
    out_file = wave.open('res/' + filename + '_audio_16k_stereo.wav', 'w')
    out_file.setparams((2, 2, 16000, 0, 'NONE', 'Uncompressed'))
    converted_audio = audioop.ratecv(audio_data, 2, 2, og_rate, 16000, None)
    out_file.writeframes(converted_audio[0])
    out_file.close()
    audio_file.close()

    # Downmix to mono, since DeepSpeech works on single-channel audio.
    sound = AudioSegment.from_wav('res/' + filename + '_audio_16k_stereo.wav')
    sound = sound.set_channels(1)
    sound.export('res/' + filename + '_audio_16k.wav', format="wav")
    return 'res/' + filename + '_audio_16k.wav', og_duration


def get_string_from_wav(wav_path):
    # Load the acoustic model and the external scorer (language model).
    model_load_start = timer()
    ds = Model(MODEL_PATH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    scorer_load_start = timer()
    ds.enableExternalScorer(SCORER_PATH)
    scorer_load_end = timer() - scorer_load_start
    print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr)

    # Read the 16 kHz mono wav as 16-bit samples and run speech-to-text on it.
    fin = wave.open(wav_path, 'rb')
    fs_orig = fin.getframerate()
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    audio_length = fin.getnframes() * (1 / fs_orig)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    string = ds.stt(audio)
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
    return string


def cut_out_silent_parts(path):
    # Use pathlib.Path so that .parent/.stem/.suffix below work as intended.
    input_path = Path('res/' + path + '.mp4')
    output_path = Path('res/' + path + '_voiced.mp4')
    cuts = ["silent"]
    codec = None
    bitrate = None
    clip = Clip(str(input_path), -1, None)
    # Detect the silent parts; the positional thresholds are kept as in the
    # original call (see the jumpcutter package for their exact meaning).
    outputs = clip.jumpcut(
        cuts,
        0.02,
        0.5,
        0.1,
        0.1,
    )
    for cut_type, jumpcutted_clip in outputs.items():
        if len(outputs) == 2:
            jumpcutted_clip.write_videofile(
                str(
                    output_path.parent
                    / f"{output_path.stem}_{cut_type}_parts_cutted{output_path.suffix}"
                ),
                codec=codec,
                bitrate=bitrate,
            )
        else:
            jumpcutted_clip.write_videofile(
                str(output_path), codec=codec, bitrate=bitrate
            )


def speed_up_video(input_file, string, duration):
    # Estimate the speaking rate from the transcript and the clip duration,
    # then speed the video up until it matches the target WPM.
    word_count = len(string.split(' '))
    print("Word count: %i" % word_count)
    og_wpm = 60.0 * word_count / duration
    print("Original words per minute after cutting out silent parts: %.2f" % og_wpm)
    speed_factor = WPM / og_wpm
    input_path = 'res/' + input_file + '.mp4'
    output_path = 'res/' + input_file + '_custom_speed.mp4'
    # setpts scales the video timestamps, atempo scales the audio tempo.
    os.system('ffmpeg -i {} -filter_complex "[0:v]setpts={}*PTS[v];[0:a]atempo={}[a]" -map "[v]" -map "[a]" {}'.format(
        input_path, (1 / speed_factor), speed_factor, output_path))


def remove_video_files(name):
    os.remove('res/' + name + '_voiced.mp4')
    os.remove('res/' + name + '_voiced_audio.wav')
    os.remove('res/' + name + '_voiced_audio_16k.wav')
    os.remove('res/' + name + '_voiced_audio_16k_stereo.wav')


def process_single_video(name):
    # Full pipeline for one video: cut silence, transcribe, then speed up.
    cut_out_silent_parts(name)
    wav_out_path, video_duration = get_wav_from_video(name + '_voiced')
    print('Duration: %.2fs' % video_duration)
    out_string = get_string_from_wav(wav_out_path)
    print(out_string)
    print(len(out_string.split(' ')))
    speed_up_video(name + '_voiced', out_string, video_duration)
    remove_video_files(name)


def get_file_name(name):
    return name.split('.')[0]


def process_videos_in_dir(dir_name):
    files = list(map(get_file_name, os.listdir('res/' + dir_name)))
    for f in files:
        print('Working on file: ' + f)
        process_single_video(dir_name + '/' + f)


process_videos_in_dir('convert')
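# The DeepSpeech model and scorer files referenced by MODEL_PATH / SCORER_PATH
# are expected in the working directory, the input videos under res/convert/,
# and each result is written alongside its input as <name>_voiced_custom_speed.mp4.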