forked from lbrayner/vtt-to-srt
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvtt_to_srt.py
85 lines (66 loc) · 3.06 KB
/
vtt_to_srt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python3
import argparse
import html
import os
import re
import sys
from pysrt.srtitem import SubRipItem, SubRipTime
from webvtt import WebVTT
def replace_colors(raw_text, colours_arg, tag_name):
    """Convert class-coloured WebVTT tags in *raw_text* to SRT ``<font>`` tags.

    For every ``class name -> colour`` entry in *colours_arg*, each
    ``<tag_name.class>...</tag_name>`` span whose class list contains that
    class is rewritten by :func:`replace_color`.

    Args:
        raw_text: Caption text that may contain WebVTT class tags.
        colours_arg: Mapping of cue class name to colour value.
        tag_name: Tag to look for, without angle brackets (e.g. ``"c"``).

    Returns:
        The text with all matching spans replaced; unchanged when nothing
        matches.
    """
    result = raw_text
    opening = "<" + re.escape(tag_name)
    closing = "</" + re.escape(tag_name) + ">"
    for k, v in colours_arg.items():
        # ``[^>]`` keeps the class list inside one opening tag; a plain ``.``
        # could run past ``>`` into a later tag and swallow the text between.
        # The class name is escaped because it is data, not a pattern.
        regex_string = (
            opening
            + r"(?:\.[^>]*?)?\." + re.escape(str(k)) + r"(?:\.[^>]*?)?>"
            + r"(.*?)"
            + closing
        )
        # DOTALL lets the tag body span the line break of a two-line cue.
        result = re.sub(
            regex_string,
            lambda x, v=v: replace_color(x, tag_name, v),
            result,
            flags=re.DOTALL,
        )
    return result


def replace_color(x, tag_name, v):
    """Build the SRT replacement for one matched class-coloured tag.

    Args:
        x: Regex match whose group 1 is the text inside the tag.
        tag_name: Name of the matched tag. ``"c"`` is dropped from the
            output; any other tag is re-emitted without its classes around
            the ``<font>`` element.
        v: Colour value placed in the ``color`` attribute.

    Returns:
        The replacement markup, with HTML entities in the inner text decoded.
    """
    coloured = "<font color=\"" + v + "\">" + html.unescape(x.group(1)) + "</font>"
    if tag_name == "c":
        return coloured
    return "<" + tag_name + ">" + coloured + "</" + tag_name + ">"
# Matches one ``::cue(.class) { ... color: value ... }`` CSS rule and captures
# (class name, colour value).
#   * ``[^}]`` keeps the search inside a single rule, so a rule without a
#     colour cannot borrow the colour of the rule that follows it.
#   * ``(?<![-\w])`` rejects the ``color:`` that is the tail of another
#     property such as ``background-color:``.
#   * The value ends at ``;`` or at the closing brace, so a final declaration
#     without a trailing semicolon is still recognised.
COLOURS_PATTERN = re.compile(
    r'::cue\(\.([^)]+)\)\s*\{[^}]*?(?<![-\w])color:([^;}]+)[^}]*\}')
def _collect_colours(styles):
    """Return a ``class name -> colour`` dict gathered from all *styles* blocks."""
    colours = {}
    for style in styles:
        # Merge into one dict: a file may carry several STYLE blocks, and
        # rebuilding the dict per block would keep only the last block's classes.
        for class_name, value in COLOURS_PATTERN.findall(style.text):
            colours[class_name] = value.replace(" ", "")
    return colours


def _caption_text(caption, colours, strip):
    """Return the SRT text for *caption*, coloured unless *strip* is set."""
    if not strip:
        caption_text = caption.raw_text
        tag_found = False
        for tag in ('c', 'i', 'b', 'u'):
            if re.search("<" + tag + "\\..*?>.*?</" + tag + ">", caption_text) is not None:
                caption_text = replace_colors(caption_text, colours, tag)
                tag_found = True
        if tag_found:
            return caption_text
    # No class tag (or stripping requested): use the tag-free text.
    return html.unescape(caption.text)


def main():
    """Command-line entry point: convert ``.vtt`` subtitle files to ``.srt``.

    Files are taken from the command line; when none are given, every
    ``*.vtt`` entry of the current directory is converted. Each output is
    written next to its input with the extension replaced by ``.srt``.
    Arguments whose extension is not ``.vtt`` are reported on stderr and
    skipped. With ``-s`` no colour tags are emitted.
    """
    parser = argparse.ArgumentParser(
        description='vtt_to_srt is a command line tool to convert vtt subtitles to srt files')
    parser.add_argument('file', nargs='*',
                        help='a file. The command accepts zero, one or more files as arguments.\n'
                             'For each .vtt, a .srt will be generated in the same folder.\n'
                             'Any other extension is ignored.')
    parser.add_argument('-s', dest='strip', action='store_true', help='strip all tags in output srt')
    args = parser.parse_args()

    files = args.file
    if not files:
        files = [name for name in os.listdir() if name.endswith(".vtt")]

    for file in files:
        file_name, file_extension = os.path.splitext(file)
        if file_extension.lower() != ".vtt":
            sys.stderr.write("Skipping %s.\n" % file)
            continue

        # Parse before opening the output so a failed read does not leave an
        # empty .srt file behind.
        read = WebVTT().read(file)
        colours = {} if args.strip else _collect_colours(read.styles)

        # The context manager guarantees the output is flushed and closed,
        # even if writing a caption raises.
        with open(file_name + ".srt", "w", encoding='utf-8') as srt:
            for index, caption in enumerate(read.captions, start=1):
                start = SubRipTime(0, 0, caption.start_in_seconds)
                end = SubRipTime(0, 0, caption.end_in_seconds)
                caption_text = _caption_text(caption, colours, args.strip)
                srt.write(str(SubRipItem(index, start, end, caption_text)) + "\n")
# Run the converter only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()