From f48e445782eab9a7e4ee3440e95fcfc777b7a5e7 Mon Sep 17 00:00:00 2001 From: BingLingGroup <42505588+BingLingGroup@users.noreply.github.com> Date: Tue, 5 May 2020 17:37:34 +0800 Subject: [PATCH] Add support for src language auto-detection when not input `-SRC` language --- CHANGELOG.md | 2 + autosub/cmdline_utils.py | 42 +++++---------- autosub/constants.py | 111 +------------------------------------- autosub/core.py | 22 +++++--- autosub/options.py | 9 ++-- docs/CHANGELOG.zh-Hans.md | 1 + 6 files changed, 36 insertions(+), 151 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ccec4013..9a4afc23 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [Added](#addedunreleased) - [Changed](#changedunreleased) - [Fixed](#fixedunreleased) + - [Removed](#removedunreleased) - [0.5.6-alpha - 2020-03-20](#056-alpha---2020-03-20) - [Added](#added056-alpha) - [Changed](#changed056-alpha) @@ -58,6 +59,7 @@ Click up arrow to go back to TOC. - Add stop words to split events in merge_src_assfile method. - Add punctuations split in merge_src_assfile method. - Add limitation in SplitIntoAudioPiece with an audio length of at least 4 bytes. +- Add support for source language auto-detection when the `-SRC` language is not given. 
#### Changed(Unreleased) diff --git a/autosub/cmdline_utils.py b/autosub/cmdline_utils.py index 81bb8ebc..3bc45127 100644 --- a/autosub/cmdline_utils.py +++ b/autosub/cmdline_utils.py @@ -99,7 +99,7 @@ def list_args(args): print("{column_1}{column_2}".format( column_1=lang_code_utils.wjust(_("Lang code"), 18), column_2=_("Description"))) - for code, language in sorted(constants.TRANSLATION_LANGUAGE_CODES.items()): + for code, language in sorted(googletrans.constants.LANGUAGES.items()): print("{column_1}{column_2}".format( column_1=lang_code_utils.wjust(code, 18), column_2=language)) @@ -107,7 +107,7 @@ def list_args(args): print(_("Match py-googletrans lang codes.")) lang_code_utils.match_print( dsr_lang=args.list_translation_codes, - match_list=list(constants.TRANSLATION_LANGUAGE_CODES.keys()), + match_list=list(googletrans.constants.LANGUAGES.keys()), min_score=args.min_score) return True @@ -423,18 +423,11 @@ def validate_aovp_args(args): # pylint: disable=too-many-branches, too-many-ret elif 'src' not in args.best_match: args.best_match.add('src') - is_src_matched = False - is_dst_matched = False + args.src_language = args.src_language.lower() + args.dst_language = args.dst_language.lower() - for key in googletrans.constants.LANGUAGES: - if args.src_language.lower() == key.lower(): - args.src_language = key - is_src_matched = True - if args.dst_language.lower() == key.lower(): - args.dst_language = key - is_dst_matched = True - - if not is_src_matched: + if args.src_language != 'auto' and \ + args.src_language not in googletrans.constants.LANGUAGES: if args.best_match and 'src' in args.best_match: print(_("Let translation source lang code " "to match py-googletrans lang codes.")) @@ -456,7 +449,7 @@ def validate_aovp_args(args): # pylint: disable=too-many-branches, too-many-ret "Or use \"-bm\"/\"--best-match\" to get a best match.").format( src=args.src_language)) - if not is_dst_matched: + if args.dst_language not in googletrans.constants.LANGUAGES: if 
args.best_match and 'd' in args.best_match: print(_("Let translation destination lang code " "to match py-googletrans lang codes.")) @@ -516,18 +509,11 @@ def validate_sp_args(args): # pylint: disable=too-many-branches,too-many-return raise exceptions.AutosubException( _("Error: Destination language not provided.")) - is_src_matched = False - is_dst_matched = False - - for key in googletrans.constants.LANGUAGES: - if args.src_language.lower() == key.lower(): - args.src_language = key - is_src_matched = True - if args.dst_language.lower() == key.lower(): - args.dst_language = key - is_dst_matched = True + args.src_language = args.src_language.lower() + args.dst_language = args.dst_language.lower() - if not is_src_matched: + if args.src_language != 'auto' and\ + args.src_language not in googletrans.constants.LANGUAGES: if args.best_match and 'src' in args.best_match: print( _("Warning: Source language \"{src}\" not supported. " @@ -554,7 +540,7 @@ def validate_sp_args(args): # pylint: disable=too-many-branches,too-many-return "Or use \"-bm\"/\"--best-match\" to get a best match.").format( src=args.src_language)) - if not is_dst_matched: + if args.dst_language not in googletrans.constants.LANGUAGES: if args.best_match and 'd' in args.best_match: print( _("Warning: Destination language \"{dst}\" not supported. 
" @@ -800,7 +786,7 @@ def sub_trans( # pylint: disable=too-many-branches, too-many-statements, too-ma # text translation # use googletrans - translated_text = core.list_to_googletrans( + translated_text, args.src_language = core.list_to_googletrans( text_list, src_language=args.src_language, dst_language=args.dst_language, @@ -1365,7 +1351,7 @@ def audio_or_video_prcs( # pylint: disable=too-many-branches, too-many-statemen pass # text translation - translated_text = core.list_to_googletrans( + translated_text, args.src_language = core.list_to_googletrans( text_list, src_language=args.src_language, dst_language=args.dst_language, diff --git a/autosub/constants.py b/autosub/constants.py index 278f695d..87aff84a 100644 --- a/autosub/constants.py +++ b/autosub/constants.py @@ -210,119 +210,10 @@ 'ur-in': 'Urdu (India)', 'ur-pk': 'Urdu (Pakistan)', 'vi-vn': 'Vietnamese (Vietnam)', - 'yue-hant-hk' : 'Chinese, Cantonese (Traditional, Hong Kong)', + 'yue-hant-hk': 'Chinese, Cantonese (Traditional, Hong Kong)', 'zu-za': 'Zulu (South Africa)' } -TRANSLATION_LANGUAGE_CODES = { - 'af': 'Afrikaans', - 'am': 'Amharic', - 'ar': 'Arabic', - 'az': 'Azerbaijani', - 'be': 'Belarusian', - 'bg': 'Bulgarian', - 'bn': 'Bengali', - 'bs': 'Bosnian', - 'ca': 'Catalan', - 'ceb': 'Cebuano', - 'co': 'Corsican', - 'cs': 'Czech', - 'cy': 'Welsh', - 'da': 'Danish', - 'de': 'German', - 'el': 'Greek', - 'en': 'English', - 'eo': 'Esperanto', - 'es': 'Spanish', - 'et': 'Estonian', - 'eu': 'Basque', - 'fa': 'Persian', - 'fi': 'Finnish', - 'fr': 'French', - 'fy': 'Frisian', - 'ga': 'Irish', - 'gd': 'Scots Gaelic', - 'gl': 'Galician', - 'gu': 'Gujarati', - 'ha': 'Hausa', - 'haw': 'Hawaiian', - 'he': 'Hebrew', - 'hi': 'Hindi', - 'hmn': 'Hmong', - 'hr': 'Croatian', - 'ht': 'Haitian Creole', - 'hu': 'Hungarian', - 'hy': 'Armenian', - 'id': 'Indonesian', - 'ig': 'Igbo', - 'is': 'Icelandic', - 'it': 'Italian', - 'iw': 'Hebrew', - 'ja': 'Japanese', - 'jw': 'Javanese', - 'ka': 'Georgian', - 'kk': 'Kazakh', - 
'km': 'Khmer', - 'kn': 'Kannada', - 'ko': 'Korean', - 'ku': 'Kurdish', - 'ky': 'Kyrgyz', - 'la': 'Latin', - 'lb': 'Luxembourgish', - 'lo': 'Lao', - 'lt': 'Lithuanian', - 'lv': 'Latvian', - 'mg': 'Malagasy', - 'mi': 'Maori', - 'mk': 'Macedonian', - 'ml': 'Malayalam', - 'mn': 'Mongolian', - 'mr': 'Marathi', - 'ms': 'Malay', - 'mt': 'Maltese', - 'my': 'Myanmar(Burmese)', - 'ne': 'Nepali', - 'nl': 'Dutch', - 'no': 'Norwegian', - 'ny': 'Nyanja(Chichewa)', - 'pa': 'Punjabi', - 'pl': 'Polish', - 'ps': 'Pashto', - 'pt': 'Portuguese(Portugal,Brazil)', - 'ro': 'Romanian', - 'ru': 'Russian', - 'sd': 'Sindhi', - 'si': 'Sinhala(Sinhalese)', - 'sk': 'Slovak', - 'sl': 'Slovenian', - 'sm': 'Samoan', - 'sn': 'Shona', - 'so': 'Somali', - 'sq': 'Albanian', - 'sr': 'Serbian', - 'st': 'Sesotho', - 'su': 'Sundanese', - 'sv': 'Swedish', - 'sw': 'Swahili', - 'ta': 'Tamil', - 'te': 'Telugu', - 'tg': 'Tajik', - 'th': 'Thai', - 'tl': 'Tagalog(Filipino)', - 'tr': 'Turkish', - 'uk': 'Ukrainian', - 'ur': 'Urdu', - 'uz': 'Uzbek', - 'vi': 'Vietnamese', - 'xh': 'Xhosa', - 'yi': 'Yiddish', - 'yo': 'Yoruba', - 'zh': 'Chinese (Simplified)', - 'zh-cn': 'Chinese (Simplified)', - 'zh-tw': 'Chinese (Traditional)', - 'zu': 'Zulu' -} - OUTPUT_FORMAT = { 'srt': 'SubRip', 'ass': 'Advanced SubStation Alpha', diff --git a/autosub/core.py b/autosub/core.py index 8a8713c2..cf518539 100644 --- a/autosub/core.py +++ b/autosub/core.py @@ -575,9 +575,9 @@ def list_to_googletrans( # pylint: disable=too-many-locals, too-many-arguments, if not text_list: return None - print(_("\nTranslating text from \"{0}\" to \"{1}\".").format( - src_language, - dst_language)) + translator = googletrans.Translator( + user_agent=user_agent, + service_urls=service_urls) size = 0 i = 0 @@ -630,6 +630,16 @@ def list_to_googletrans( # pylint: disable=too-many-locals, too-many-arguments, valid_index.append(i) # valid_index for valid text position end + if src_language == "auto": + content_to_trans = '\n'.join(text_list[i:partial_index[0]]) 
+ result_src = translator.detect(content_to_trans).lang + else: + result_src = src_language + + print(_("\nTranslating text from \"{0}\" to \"{1}\".").format( + result_src, + dst_language)) + widgets = [_("Translation: "), progressbar.Percentage(), ' ', progressbar.Bar(), ' ', @@ -642,9 +652,6 @@ def list_to_googletrans( # pylint: disable=too-many-locals, too-many-arguments, # total position j = 0 # valid_index position - translator = googletrans.Translator( - user_agent=user_agent, - service_urls=service_urls) for index in partial_index: content_to_trans = '\n'.join(text_list[i:index]) @@ -654,6 +661,7 @@ def list_to_googletrans( # pylint: disable=too-many-locals, too-many-arguments, dest=dst_language, src=src_language) result_text = translation.text.translate(str.maketrans('’', '\'')) + result_src = translation.src result_list = result_text.split('\n') k = 0 len_result_list = len(result_list) @@ -702,7 +710,7 @@ def list_to_googletrans( # pylint: disable=too-many-locals, too-many-arguments, print(_("Cancelling translation.")) return 1 - return translated_text + return translated_text, result_src def list_to_sub_str( diff --git a/autosub/options.py b/autosub/options.py index 26b77488..708e0747 100644 --- a/autosub/options.py +++ b/autosub/options.py @@ -150,18 +150,15 @@ def get_cmd_parser(): # pylint: disable=too-many-statements lang_group.add_argument( '-SRC', '--src-language', metavar=_('lang_code'), + default='auto', help=_("Lang code/Lang tag for translation source language. " - "If not given, use langcodes to get a best matching " - "of the \"-S\"/\"--speech-language\". " - "If using py-googletrans as the method to translate, " - "WRONG INPUT STOP RUNNING. " + "If not given, use py-googletrans to auto-detect the src language. " "(arg_num = 1) (default: %(default)s)")) lang_group.add_argument( '-D', '--dst-language', metavar=_('lang_code'), help=_("Lang code/Lang tag for translation destination language. " - "Same attention in the \"-SRC\"/\"--src-language\". 
" "(arg_num = 1) (default: %(default)s)")) lang_group.add_argument( @@ -170,7 +167,7 @@ def get_cmd_parser(): # pylint: disable=too-many-statements nargs="*", help=_("Allow langcodes to get a best matching lang code " "when your input is wrong. " - "Only functional for py-googletrans and Google Speech V2. " + "Only functional for py-googletrans and Google Speech API. " "Available modes: " "s, src, d, all. " "\"s\" for \"-S\"/\"--speech-language\". " diff --git a/docs/CHANGELOG.zh-Hans.md b/docs/CHANGELOG.zh-Hans.md index 34892b30..aeaa65e9 100644 --- a/docs/CHANGELOG.zh-Hans.md +++ b/docs/CHANGELOG.zh-Hans.md @@ -56,6 +56,7 @@ - 添加停用词用于merge_src_assfile方法里的断句。 - 添加标点符号分割功能在merge_src_assfile方法里。 - 添加音频长度至少为4字节的检测,在SplitIntoAudioPiece里。 +- 添加源语言自动识别功能,当不输入`-SRC`选项时。 #### 改动(未发布)