add --sub-cn-inline-limit and --sub-cn-modal-words by jionlp. #14
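In short: the patch adds a TranscribeMiddleware post-processing pass for Chinese subtitles, exposed through two new CLI options (named --sub-optimize-cn and --modal-words-cn as of this commit). The first splits sentences longer than 16 characters into shorter subtitles by apportioning the original timestamps over the pieces; the second strips a configurable, comma-separated list of modal (filler) words such as 啊, 嗯, 呢.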

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
35 changes: 20 additions & 15 deletions autocut/main.py
@@ -6,37 +6,41 @@

def main():
    parser = argparse.ArgumentParser(description='Edit videos based on transcribed subtitles',
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    logging.basicConfig(format='[autocut:%(filename)s:L%(lineno)d] %(levelname)-6s %(message)s')
    logging.getLogger().setLevel(logging.INFO)

    parser.add_argument('inputs', type=str, nargs='+',
                        help='Input filenames/folders')
    parser.add_argument('-t', '--transcribe', help='Transcribe videos/audio into subtitles',
                        action=argparse.BooleanOptionalAction)
    parser.add_argument('-c', '--cut', help='Cut a video based on subtitles',
                        action=argparse.BooleanOptionalAction)
    parser.add_argument('-d', '--daemon', help='Monitor a folder to transcribe and cut',
                        action=argparse.BooleanOptionalAction)
    parser.add_argument('-s', help='Convert .srt to a compact format for easier editing',
                        action=argparse.BooleanOptionalAction)
    parser.add_argument('--lang', type=str, default='zh',
                        choices=['zh', 'en'],
                        help='The output language of transcription')
    parser.add_argument('--prompt', type=str, default='大家好,',
                        help='Initial prompt fed into whisper')
    parser.add_argument('--whisper-model', type=str, default='small',
                        choices=['tiny', 'base', 'small', 'medium', 'large'],
                        help='The whisper model used to transcribe.')
    parser.add_argument('--bitrate', type=str, default='10m',
                        help='The bitrate to export the cut video, such as 10m, 1m, or 500k')
    parser.add_argument('--vad', help='Whether to use VAD',
                        action=argparse.BooleanOptionalAction)
    parser.add_argument('--force', help='Force write even if files exist',
                        action=argparse.BooleanOptionalAction)
    parser.add_argument('--encoding', type=str, default='utf-8',
                        help='Document encoding format')
    parser.add_argument('--sub-optimize-cn', help='Optimize the display of long sentences in subtitles for Chinese',
                        action=argparse.BooleanOptionalAction)
    parser.add_argument('--modal-words-cn', type=str, default="啊,嗯,呢,呐,吧",  # use an English comma to separate
                        help='Modal words to filter out of sentences for Chinese')

    args = parser.parse_args()

@@ -54,5 +58,6 @@ def main():
    else:
        logging.warning('No action, use -c, -t or -d')


if __name__ == "__main__":
    main()
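For reference, a minimal standalone sketch (separate from the PR; the sample argv is made up) of how the two new options parse. argparse.BooleanOptionalAction needs Python 3.9+:

import argparse

# Hypothetical stand-alone parser exercising only the two new flags.
p = argparse.ArgumentParser()
p.add_argument('--sub-optimize-cn', action=argparse.BooleanOptionalAction)
p.add_argument('--modal-words-cn', type=str, default='啊,嗯,呢,呐,吧')
args = p.parse_args(['--sub-optimize-cn'])
print(args.sub_optimize_cn)  # True; --no-sub-optimize-cn would give False
print(args.modal_words_cn)   # falls back to the default word list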
45 changes: 24 additions & 21 deletions autocut/transcribe.py
@@ -23,23 +23,23 @@ def run(self):
        for input in self.args.inputs:
            logging.info(f'Transcribing {input}')
            name, _ = os.path.splitext(input)
            if utils.check_exists(name + '.md', self.args.force):
                continue

            audio = whisper.load_audio(input, sr=self.sampling_rate)
            speech_timestamps = self._detect_voice_activity(audio)
            transcribe_results = self._transcribe(audio, speech_timestamps)

            output = name + '.srt'
            self._save_srt(output, transcribe_results)
            logging.info(f'Transcribed {input} to {output}')
            self._save_md(name + '.md', output, input)
            logging.info(f'Saved texts to {name + ".md"} to mark sentences')

    def _detect_voice_activity(self, audio):
        """Detect segments that have voice activities"""
        if not self.args.vad:
            return [{'start': 0, 'end': len(audio)}]
        tic = time.time()
        if self.vad_model is None or self.detect_speech is None:
            self.vad_model, funcs = torch.hub.load(
@@ -50,7 +50,7 @@ def _detect_voice_activity(self, audio):
            self.detect_speech = funcs[0]

        speeches = self.detect_speech(audio, self.vad_model,
                                      sampling_rate=self.sampling_rate)

        # Merge segments that are very close together
        # speeches = _merge_adjacent_segments(speeches, 0.5 * self.sampling_rate)
@@ -59,13 +59,13 @@
        speeches = utils.remove_short_segments(speeches, 10.0 * self.sampling_rate)

        # Expand to avoid a too-tight cut. You can tune the pad length.
        speeches = utils.expand_segments(speeches, 0.2 * self.sampling_rate,
                                         0.0 * self.sampling_rate, audio.shape[0])

        logging.info(f'Done voice activity detection in {time.time() - tic:.1f} sec')
        return speeches

    def _transcribe(self, audio, speech_timestamps):
        tic = time.time()
        if self.whisper_model is None:
            self.whisper_model = whisper.load_model(self.args.whisper_model)
@@ -74,11 +74,11 @@ def _transcribe(self, audio, speech_timestamps):
        # TODO: a better way is merging these segments into a single one, so whisper can get more context
        for seg in speech_timestamps:
            r = self.whisper_model.transcribe(
                audio[int(seg['start']):int(seg['end'])],
                task='transcribe', language=self.args.lang, initial_prompt=self.args.prompt)
            r['origin_timestamp'] = seg
            res.append(r)
        logging.info(f'Done transcription in {time.time() - tic:.1f} sec')
        return res

    def _save_srt(self, output, transcribe_results):
@@ -88,9 +88,9 @@ def _save_srt(self, output, transcribe_results):

        def _add_sub(start, end, text):
            subs.append(srt.Subtitle(index=0,
                                     start=datetime.timedelta(seconds=start),
                                     end=datetime.timedelta(seconds=end),
                                     content=cc.convert(text.strip())))

        prev_end = 0
        for r in transcribe_results:
@@ -106,7 +106,10 @@ def _add_sub(start, end, text):
                _add_sub(start, end, s["text"])
                prev_end = end

        # Post-process the finished subtitle list in place before composing the .srt file
        from .transcribe_middleware import TranscribeMiddleware
        TranscribeMiddleware(self.args, subs).run()

        with open(output, mode='wb') as f:
            f.write(srt.compose(subs).encode(self.args.encoding, 'replace'))

    def _save_md(self, md_fn, srt_fn, video_fn):
@@ -117,11 +120,11 @@ def _save_md(self, md_fn, srt_fn, video_fn):
        md.add_done_edditing(False)
        md.add_video(os.path.basename(video_fn))
        md.add(f'\nTexts generated from [{os.path.basename(srt_fn)}]({os.path.basename(srt_fn)}). '
               'Mark the sentences to keep for autocut.\n'
               'The format is [subtitle_index,duration_in_second] subtitle context.\n\n')

        for s in subs:
            sec = s.start.seconds
            pre = f'[{s.index},{sec // 60:02d}:{sec % 60:02d}]'
            md.add_task(False, f'{pre:11} {s.content.strip()}')
        md.write()
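The new hook in _save_srt relies on TranscribeMiddleware mutating subs in place before srt.compose serializes the list. A tiny sketch of that contract (the dummy subtitle and edit are invented, not from the PR):

import datetime

import srt

subs = [srt.Subtitle(index=0, start=datetime.timedelta(seconds=0),
                     end=datetime.timedelta(seconds=2), content='hello')]
subs[0].content = 'HELLO'  # a middleware-style in-place edit
print(srt.compose(subs))   # compose() reindexes, so the index=0 placeholders are fine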
69 changes: 69 additions & 0 deletions autocut/transcribe_middleware.py
@@ -0,0 +1,69 @@
import srt


class TranscribeMiddleware:
    def __init__(self, args, subs: list[srt.Subtitle]):
        self.args = args
        self.subs = subs
        self.SINGLE_SUB_MAX_LEN = 16

    def run(self):
        if self.args.lang == "zh":
            if self.args.sub_optimize_cn:
                self._sub_split_CN()

            if len(self.args.modal_words_cn.strip()) > 0:
                self._sub_filter_modal_CN()

    def _sub_split_CN(self):
        import datetime

        new_subs = []

        for sub in self.subs:
            duration = (sub.end - sub.start).total_seconds()

            # In my opinion, the subtitle doesn't need any mood punctuation marks
            sub_content_temp = sub.content.strip() \
                .replace(",", ",").replace("。", ",").replace("!", ",").replace("?", ",")
            sub_split_list = sub_content_temp.split(",")
            sub_len = len(sub_content_temp) - sub_content_temp.count(",")

            # Sliding window to control the length of a single sentence
            interval_start = sub.start.total_seconds()
            interval_end = sub.start.total_seconds()
            interval_len = 0
            start_index = 0
            for index, sub_split in enumerate(sub_split_list):
                # Apportion the original duration by character count
                interval_end = interval_end + (len(sub_split) / sub_len) * duration
                interval_len = interval_len + len(sub_split) + 1

                if interval_len < self.SINGLE_SUB_MAX_LEN + 1:
                    continue

                new_subs.append(srt.Subtitle(index=0,
                                             start=datetime.timedelta(seconds=interval_start),
                                             end=datetime.timedelta(seconds=interval_end),
                                             content=sub_split if index == start_index
                                             else ",".join(sub_split_list[start_index:index + 1])))

                interval_start = interval_end
                start_index = index + 1
                interval_len = 0

            # Flush whatever remains after the last full window
            if interval_len != 0:
                new_subs.append(srt.Subtitle(index=0,
                                             start=datetime.timedelta(seconds=interval_start),
                                             end=datetime.timedelta(seconds=interval_end),
                                             content=sub.content.strip() if start_index == 0
                                             else ",".join(sub_split_list[start_index:])))

        self.subs.clear()
        self.subs.extend(new_subs)

    def _sub_filter_modal_CN(self):
        key_list = [key.strip() for key in self.args.modal_words_cn.split(",")]
        for sub in self.subs:
            for char in sub.content.strip():
                if char in key_list:
                    sub.content = sub.content.replace(char, "")
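Finally, an end-to-end sketch of the middleware itself (the SimpleNamespace is a hand-rolled stand-in for the parsed CLI args, the sentence is invented, and it assumes autocut is importable):

import datetime
from types import SimpleNamespace

import srt

from autocut.transcribe_middleware import TranscribeMiddleware

args = SimpleNamespace(lang='zh', sub_optimize_cn=True,
                       modal_words_cn='啊,嗯,呢,呐,吧')
subs = [srt.Subtitle(index=0,
                     start=datetime.timedelta(seconds=0),
                     end=datetime.timedelta(seconds=8),
                     content='大家好啊,今天我们来聊一聊自动剪辑,希望对大家有帮助。')]
TranscribeMiddleware(args, subs).run()
for s in subs:
    # the 24-character sentence is split into two subtitles and the modal word 啊 is dropped
    print(f'{s.start.total_seconds():.1f}-{s.end.total_seconds():.1f} {s.content}')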