add --sub-cn-inline-limit and --sub-cn-modal-words by jionlp. #14
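In short: the patch adds a TranscribeMiddleware post-processing pass for Chinese subtitles, exposed through two new CLI options (named --sub-optimize-cn and --modal-words-cn as of this commit). The first splits sentences longer than 16 characters into shorter subtitles by apportioning the original timestamps over the pieces; the second strips a configurable, comma-separated list of modal (filler) words such as 啊, 嗯, 呢.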

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
35 changes: 20 additions & 15 deletions autocut/main.py
@@ -6,37 +6,41 @@

def main():
    parser = argparse.ArgumentParser(description='Edit videos based on transcribed subtitles',
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    logging.basicConfig(format='[autocut:%(filename)s:L%(lineno)d] %(levelname)-6s %(message)s')
    logging.getLogger().setLevel(logging.INFO)

    parser.add_argument('inputs', type=str, nargs='+',
                        help='Input filenames/folders')
    parser.add_argument('-t', '--transcribe', help='Transcribe videos/audio into subtitles',
                        action=argparse.BooleanOptionalAction)
    parser.add_argument('-c', '--cut', help='Cut a video based on subtitles',
                        action=argparse.BooleanOptionalAction)
    parser.add_argument('-d', '--daemon', help='Monitor a folder to transcribe and cut',
                        action=argparse.BooleanOptionalAction)
    parser.add_argument('-s', help='Convert .srt to a compact format for easier editing',
                        action=argparse.BooleanOptionalAction)
    parser.add_argument('--lang', type=str, default='zh',
                        choices=['zh', 'en'],
                        help='The output language of transcription')
    parser.add_argument('--prompt', type=str, default='大家好,',
                        help='Initial prompt fed into whisper')
    parser.add_argument('--whisper-model', type=str, default='small',
                        choices=['tiny', 'base', 'small', 'medium', 'large'],
                        help='The whisper model used to transcribe.')
    parser.add_argument('--bitrate', type=str, default='10m',
                        help='The bitrate to export the cut video, such as 10m, 1m, or 500k')
    parser.add_argument('--vad', help='Whether to use VAD',
                        action=argparse.BooleanOptionalAction)
    parser.add_argument('--force', help='Force write even if files exist',
                        action=argparse.BooleanOptionalAction)
    parser.add_argument('--encoding', type=str, default='utf-8',
                        help='Document encoding format')
    parser.add_argument('--sub-optimize-cn', help='Optimize the display of long sentences in subtitles for Chinese',
                        action=argparse.BooleanOptionalAction)
    parser.add_argument('--modal-words-cn', type=str, default="啊,嗯,呢,呐,吧",  # use an English comma to separate
                        help='Modal words to filter out of sentences for Chinese')

    args = parser.parse_args()

@@ -54,5 +58,6 @@ def main():
    else:
        logging.warning('No action, use -c, -t or -d')


if __name__ == "__main__":
    main()
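For reference, a minimal standalone sketch (separate from the PR; the sample argv is made up) of how the two new options parse. argparse.BooleanOptionalAction needs Python 3.9+:

import argparse

# Hypothetical stand-alone parser exercising only the two new flags.
p = argparse.ArgumentParser()
p.add_argument('--sub-optimize-cn', action=argparse.BooleanOptionalAction)
p.add_argument('--modal-words-cn', type=str, default='啊,嗯,呢,呐,吧')
args = p.parse_args(['--sub-optimize-cn'])
print(args.sub_optimize_cn)  # True; --no-sub-optimize-cn would give False
print(args.modal_words_cn)   # falls back to the default word list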
45 changes: 24 additions & 21 deletions autocut/transcribe.py
@@ -23,23 +23,23 @@ def run(self):
        for input in self.args.inputs:
            logging.info(f'Transcribing {input}')
            name, _ = os.path.splitext(input)
            if utils.check_exists(name + '.md', self.args.force):
                continue

            audio = whisper.load_audio(input, sr=self.sampling_rate)
            speech_timestamps = self._detect_voice_activity(audio)
            transcribe_results = self._transcribe(audio, speech_timestamps)

            output = name + '.srt'
            self._save_srt(output, transcribe_results)
            logging.info(f'Transcribed {input} to {output}')
            self._save_md(name + '.md', output, input)
            logging.info(f'Saved texts to {name + ".md"} to mark sentences')

    def _detect_voice_activity(self, audio):
        """Detect segments that have voice activities"""
        if not self.args.vad:
            return [{'start': 0, 'end': len(audio)}]
        tic = time.time()
        if self.vad_model is None or self.detect_speech is None:
            self.vad_model, funcs = torch.hub.load(
@@ -50,7 +50,7 @@ def _detect_voice_activity(self, audio):
            self.detect_speech = funcs[0]

        speeches = self.detect_speech(audio, self.vad_model,
                                      sampling_rate=self.sampling_rate)

        # Merge segments that are very close together
        # speeches = _merge_adjacent_segments(speeches, 0.5 * self.sampling_rate)
@@ -59,13 +59,13 @@
        speeches = utils.remove_short_segments(speeches, 10.0 * self.sampling_rate)

        # Expand to avoid a too-tight cut. You can tune the pad length.
        speeches = utils.expand_segments(speeches, 0.2 * self.sampling_rate,
                                         0.0 * self.sampling_rate, audio.shape[0])

        logging.info(f'Done voice activity detection in {time.time() - tic:.1f} sec')
        return speeches

    def _transcribe(self, audio, speech_timestamps):
        tic = time.time()
        if self.whisper_model is None:
            self.whisper_model = whisper.load_model(self.args.whisper_model)
@@ -74,11 +74,11 @@ def _transcribe(self, audio, speech_timestamps):
        # TODO: a better way is merging these segments into a single one, so whisper can get more context
        for seg in speech_timestamps:
            r = self.whisper_model.transcribe(
                audio[int(seg['start']):int(seg['end'])],
                task='transcribe', language=self.args.lang, initial_prompt=self.args.prompt)
            r['origin_timestamp'] = seg
            res.append(r)
        logging.info(f'Done transcription in {time.time() - tic:.1f} sec')
        return res

    def _save_srt(self, output, transcribe_results):
@@ -88,9 +88,9 @@ def _save_srt(self, output, transcribe_results):

        def _add_sub(start, end, text):
            subs.append(srt.Subtitle(index=0,
                                     start=datetime.timedelta(seconds=start),
                                     end=datetime.timedelta(seconds=end),
                                     content=cc.convert(text.strip())))

        prev_end = 0
        for r in transcribe_results:
@@ -106,7 +106,10 @@ def _add_sub(start, end, text):
                _add_sub(start, end, s["text"])
                prev_end = end

        # Post-process the finished subtitle list in place before composing the .srt file
        from .transcribe_middleware import TranscribeMiddleware
        TranscribeMiddleware(self.args, subs).run()

        with open(output, mode='wb') as f:
            f.write(srt.compose(subs).encode(self.args.encoding, 'replace'))

    def _save_md(self, md_fn, srt_fn, video_fn):
@@ -117,11 +120,11 @@ def _save_md(self, md_fn, srt_fn, video_fn):
        md.add_done_edditing(False)
        md.add_video(os.path.basename(video_fn))
        md.add(f'\nTexts generated from [{os.path.basename(srt_fn)}]({os.path.basename(srt_fn)}). '
               'Mark the sentences to keep for autocut.\n'
               'The format is [subtitle_index,duration_in_second] subtitle context.\n\n')

        for s in subs:
            sec = s.start.seconds
            pre = f'[{s.index},{sec // 60:02d}:{sec % 60:02d}]'
            md.add_task(False, f'{pre:11} {s.content.strip()}')
        md.write()
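The new hook in _save_srt relies on TranscribeMiddleware mutating subs in place before srt.compose serializes the list. A tiny sketch of that contract (the dummy subtitle and edit are invented, not from the PR):

import datetime

import srt

subs = [srt.Subtitle(index=0, start=datetime.timedelta(seconds=0),
                     end=datetime.timedelta(seconds=2), content='hello')]
subs[0].content = 'HELLO'  # a middleware-style in-place edit
print(srt.compose(subs))   # compose() reindexes, so the index=0 placeholders are fine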
69 changes: 69 additions & 0 deletions autocut/transcribe_middleware.py
@@ -0,0 +1,69 @@
import srt


class TranscribeMiddleware:
    def __init__(self, args, subs: list[srt.Subtitle]):
        self.args = args
        self.subs = subs
        self.SINGLE_SUB_MAX_LEN = 16

    def run(self):
        if self.args.lang == "zh":
            if self.args.sub_optimize_cn:
                self._sub_split_CN()

            if len(self.args.modal_words_cn.strip()) > 0:
                self._sub_filter_modal_CN()

    def _sub_split_CN(self):
        import datetime

        new_subs = []

        for sub in self.subs:
            duration = (sub.end - sub.start).total_seconds()

            # In my opinion, the subtitle doesn't need any mood punctuation marks
            sub_content_temp = sub.content.strip() \
                .replace(",", ",").replace("。", ",").replace("!", ",").replace("?", ",")
            sub_split_list = sub_content_temp.split(",")
            sub_len = len(sub_content_temp) - sub_content_temp.count(",")

            # Sliding window to control the length of a single sentence
            interval_start = sub.start.total_seconds()
            interval_end = sub.start.total_seconds()
            interval_len = 0
            start_index = 0
            for index, sub_split in enumerate(sub_split_list):
                # Apportion the original duration by character count
                interval_end = interval_end + (len(sub_split) / sub_len) * duration
                interval_len = interval_len + len(sub_split) + 1

                if interval_len < self.SINGLE_SUB_MAX_LEN + 1:
                    continue

                new_subs.append(srt.Subtitle(index=0,
                                             start=datetime.timedelta(seconds=interval_start),
                                             end=datetime.timedelta(seconds=interval_end),
                                             content=sub_split if index == start_index
                                             else ",".join(sub_split_list[start_index:index + 1])))

                interval_start = interval_end
                start_index = index + 1
                interval_len = 0

            # Flush whatever remains after the last full window
            if interval_len != 0:
                new_subs.append(srt.Subtitle(index=0,
                                             start=datetime.timedelta(seconds=interval_start),
                                             end=datetime.timedelta(seconds=interval_end),
                                             content=sub.content.strip() if start_index == 0
                                             else ",".join(sub_split_list[start_index:])))

        self.subs.clear()
        self.subs.extend(new_subs)

    def _sub_filter_modal_CN(self):
        key_list = [key.strip() for key in self.args.modal_words_cn.split(",")]
        for sub in self.subs:
            for char in sub.content.strip():
                if char in key_list:
                    sub.content = sub.content.replace(char, "")
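Finally, an end-to-end sketch of the middleware itself (the SimpleNamespace is a hand-rolled stand-in for the parsed CLI args, the sentence is invented, and it assumes autocut is importable):

import datetime
from types import SimpleNamespace

import srt

from autocut.transcribe_middleware import TranscribeMiddleware

args = SimpleNamespace(lang='zh', sub_optimize_cn=True,
                       modal_words_cn='啊,嗯,呢,呐,吧')
subs = [srt.Subtitle(index=0,
                     start=datetime.timedelta(seconds=0),
                     end=datetime.timedelta(seconds=8),
                     content='大家好啊,今天我们来聊一聊自动剪辑,希望对大家有帮助。')]
TranscribeMiddleware(args, subs).run()
for s in subs:
    # the 24-character sentence is split into two subtitles and the modal word 啊 is dropped
    print(f'{s.start.total_seconds():.1f}-{s.end.total_seconds():.1f} {s.content}')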