Skip to content

Commit

Permalink
Add support for src language auto-detection when not input -SRC language
Browse files Browse the repository at this point in the history
  • Loading branch information
BingLingGroup committed May 5, 2020
1 parent d84ecab commit f48e445
Show file tree
Hide file tree
Showing 6 changed files with 36 additions and 151 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [Added](#addedunreleased)
- [Changed](#changedunreleased)
- [Fixed](#fixedunreleased)
- [Removed](#removedunreleased)
- [0.5.6-alpha - 2020-03-20](#056-alpha---2020-03-20)
- [Added](#added056-alpha)
- [Changed](#changed056-alpha)
Expand Down Expand Up @@ -58,6 +59,7 @@ Click up arrow to go back to TOC.
- Add stop words to split events in merge_src_assfile method.
- Add punctuations split in merge_src_assfile method.
- Add a check in SplitIntoAudioPiece that requires an audio length of at least 4 bytes.
- Add support for source language auto-detection when the `-SRC` language is not given.

#### Changed(Unreleased)

Expand Down
42 changes: 14 additions & 28 deletions autosub/cmdline_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,15 +99,15 @@ def list_args(args):
print("{column_1}{column_2}".format(
column_1=lang_code_utils.wjust(_("Lang code"), 18),
column_2=_("Description")))
for code, language in sorted(constants.TRANSLATION_LANGUAGE_CODES.items()):
for code, language in sorted(googletrans.constants.LANGUAGES.items()):
print("{column_1}{column_2}".format(
column_1=lang_code_utils.wjust(code, 18),
column_2=language))
else:
print(_("Match py-googletrans lang codes."))
lang_code_utils.match_print(
dsr_lang=args.list_translation_codes,
match_list=list(constants.TRANSLATION_LANGUAGE_CODES.keys()),
match_list=list(googletrans.constants.LANGUAGES.keys()),
min_score=args.min_score)
return True

Expand Down Expand Up @@ -423,18 +423,11 @@ def validate_aovp_args(args): # pylint: disable=too-many-branches, too-many-ret
elif 'src' not in args.best_match:
args.best_match.add('src')

is_src_matched = False
is_dst_matched = False
args.src_language = args.src_language.lower()
args.dst_language = args.dst_language.lower()

for key in googletrans.constants.LANGUAGES:
if args.src_language.lower() == key.lower():
args.src_language = key
is_src_matched = True
if args.dst_language.lower() == key.lower():
args.dst_language = key
is_dst_matched = True

if not is_src_matched:
if args.src_language != 'auto' and \
args.src_language not in googletrans.constants.LANGUAGES:
if args.best_match and 'src' in args.best_match:
print(_("Let translation source lang code "
"to match py-googletrans lang codes."))
Expand All @@ -456,7 +449,7 @@ def validate_aovp_args(args): # pylint: disable=too-many-branches, too-many-ret
"Or use \"-bm\"/\"--best-match\" to get a best match.").format(
src=args.src_language))

if not is_dst_matched:
if args.dst_language not in googletrans.constants.LANGUAGES:
if args.best_match and 'd' in args.best_match:
print(_("Let translation destination lang code "
"to match py-googletrans lang codes."))
Expand Down Expand Up @@ -516,18 +509,11 @@ def validate_sp_args(args): # pylint: disable=too-many-branches,too-many-return
raise exceptions.AutosubException(
_("Error: Destination language not provided."))

is_src_matched = False
is_dst_matched = False

for key in googletrans.constants.LANGUAGES:
if args.src_language.lower() == key.lower():
args.src_language = key
is_src_matched = True
if args.dst_language.lower() == key.lower():
args.dst_language = key
is_dst_matched = True
args.src_language = args.src_language.lower()
args.dst_language = args.dst_language.lower()

if not is_src_matched:
if args.src_language != 'auto' and\
args.src_language not in googletrans.constants.LANGUAGES:
if args.best_match and 'src' in args.best_match:
print(
_("Warning: Source language \"{src}\" not supported. "
Expand All @@ -554,7 +540,7 @@ def validate_sp_args(args): # pylint: disable=too-many-branches,too-many-return
"Or use \"-bm\"/\"--best-match\" to get a best match.").format(
src=args.src_language))

if not is_dst_matched:
if args.dst_language not in googletrans.constants.LANGUAGES:
if args.best_match and 'd' in args.best_match:
print(
_("Warning: Destination language \"{dst}\" not supported. "
Expand Down Expand Up @@ -800,7 +786,7 @@ def sub_trans( # pylint: disable=too-many-branches, too-many-statements, too-ma

# text translation
# use googletrans
translated_text = core.list_to_googletrans(
translated_text, args.src_language = core.list_to_googletrans(
text_list,
src_language=args.src_language,
dst_language=args.dst_language,
Expand Down Expand Up @@ -1365,7 +1351,7 @@ def audio_or_video_prcs( # pylint: disable=too-many-branches, too-many-statemen
pass

# text translation
translated_text = core.list_to_googletrans(
translated_text, args.src_language = core.list_to_googletrans(
text_list,
src_language=args.src_language,
dst_language=args.dst_language,
Expand Down
111 changes: 1 addition & 110 deletions autosub/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,119 +210,10 @@
'ur-in': 'Urdu (India)',
'ur-pk': 'Urdu (Pakistan)',
'vi-vn': 'Vietnamese (Vietnam)',
'yue-hant-hk' : 'Chinese, Cantonese (Traditional, Hong Kong)',
'yue-hant-hk': 'Chinese, Cantonese (Traditional, Hong Kong)',
'zu-za': 'Zulu (South Africa)'
}

# Translation language codes mapped to their English display names.
# All keys are lowercase. A few languages are listed under two codes:
# 'he' and 'iw' both map to Hebrew, and 'zh' and 'zh-cn' both map to
# Chinese (Simplified).
# NOTE(review): the cmdline_utils.py hunks of this same commit show lookups
# against googletrans.constants.LANGUAGES next to the lookups against this
# table -- presumably this table is being superseded; confirm before adding
# new uses.
TRANSLATION_LANGUAGE_CODES = {
'af': 'Afrikaans',
'am': 'Amharic',
'ar': 'Arabic',
'az': 'Azerbaijani',
'be': 'Belarusian',
'bg': 'Bulgarian',
'bn': 'Bengali',
'bs': 'Bosnian',
'ca': 'Catalan',
'ceb': 'Cebuano',
'co': 'Corsican',
'cs': 'Czech',
'cy': 'Welsh',
'da': 'Danish',
'de': 'German',
'el': 'Greek',
'en': 'English',
'eo': 'Esperanto',
'es': 'Spanish',
'et': 'Estonian',
'eu': 'Basque',
'fa': 'Persian',
'fi': 'Finnish',
'fr': 'French',
'fy': 'Frisian',
'ga': 'Irish',
'gd': 'Scots Gaelic',
'gl': 'Galician',
'gu': 'Gujarati',
'ha': 'Hausa',
'haw': 'Hawaiian',
'he': 'Hebrew',
'hi': 'Hindi',
'hmn': 'Hmong',
'hr': 'Croatian',
'ht': 'Haitian Creole',
'hu': 'Hungarian',
'hy': 'Armenian',
'id': 'Indonesian',
'ig': 'Igbo',
'is': 'Icelandic',
'it': 'Italian',
'iw': 'Hebrew',
'ja': 'Japanese',
'jw': 'Javanese',
'ka': 'Georgian',
'kk': 'Kazakh',
'km': 'Khmer',
'kn': 'Kannada',
'ko': 'Korean',
'ku': 'Kurdish',
'ky': 'Kyrgyz',
'la': 'Latin',
'lb': 'Luxembourgish',
'lo': 'Lao',
'lt': 'Lithuanian',
'lv': 'Latvian',
'mg': 'Malagasy',
'mi': 'Maori',
'mk': 'Macedonian',
'ml': 'Malayalam',
'mn': 'Mongolian',
'mr': 'Marathi',
'ms': 'Malay',
'mt': 'Maltese',
'my': 'Myanmar(Burmese)',
'ne': 'Nepali',
'nl': 'Dutch',
'no': 'Norwegian',
'ny': 'Nyanja(Chichewa)',
'pa': 'Punjabi',
'pl': 'Polish',
'ps': 'Pashto',
'pt': 'Portuguese(Portugal,Brazil)',
'ro': 'Romanian',
'ru': 'Russian',
'sd': 'Sindhi',
'si': 'Sinhala(Sinhalese)',
'sk': 'Slovak',
'sl': 'Slovenian',
'sm': 'Samoan',
'sn': 'Shona',
'so': 'Somali',
'sq': 'Albanian',
'sr': 'Serbian',
'st': 'Sesotho',
'su': 'Sundanese',
'sv': 'Swedish',
'sw': 'Swahili',
'ta': 'Tamil',
'te': 'Telugu',
'tg': 'Tajik',
'th': 'Thai',
'tl': 'Tagalog(Filipino)',
'tr': 'Turkish',
'uk': 'Ukrainian',
'ur': 'Urdu',
'uz': 'Uzbek',
'vi': 'Vietnamese',
'xh': 'Xhosa',
'yi': 'Yiddish',
'yo': 'Yoruba',
'zh': 'Chinese (Simplified)',
'zh-cn': 'Chinese (Simplified)',
'zh-tw': 'Chinese (Traditional)',
'zu': 'Zulu'
}

OUTPUT_FORMAT = {
'srt': 'SubRip',
'ass': 'Advanced SubStation Alpha',
Expand Down
22 changes: 15 additions & 7 deletions autosub/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -575,9 +575,9 @@ def list_to_googletrans( # pylint: disable=too-many-locals, too-many-arguments,
if not text_list:
return None

print(_("\nTranslating text from \"{0}\" to \"{1}\".").format(
src_language,
dst_language))
translator = googletrans.Translator(
user_agent=user_agent,
service_urls=service_urls)

size = 0
i = 0
Expand Down Expand Up @@ -630,6 +630,16 @@ def list_to_googletrans( # pylint: disable=too-many-locals, too-many-arguments,
valid_index.append(i)
# valid_index for valid text position end

if src_language == "auto":
content_to_trans = '\n'.join(text_list[i:partial_index[0]])
result_src = translator.detect(content_to_trans).lang
else:
result_src = src_language

print(_("\nTranslating text from \"{0}\" to \"{1}\".").format(
result_src,
dst_language))

widgets = [_("Translation: "),
progressbar.Percentage(), ' ',
progressbar.Bar(), ' ',
Expand All @@ -642,9 +652,6 @@ def list_to_googletrans( # pylint: disable=too-many-locals, too-many-arguments,
# total position
j = 0
# valid_index position
translator = googletrans.Translator(
user_agent=user_agent,
service_urls=service_urls)

for index in partial_index:
content_to_trans = '\n'.join(text_list[i:index])
Expand All @@ -654,6 +661,7 @@ def list_to_googletrans( # pylint: disable=too-many-locals, too-many-arguments,
dest=dst_language,
src=src_language)
result_text = translation.text.translate(str.maketrans('’', '\''))
result_src = translation.src
result_list = result_text.split('\n')
k = 0
len_result_list = len(result_list)
Expand Down Expand Up @@ -702,7 +710,7 @@ def list_to_googletrans( # pylint: disable=too-many-locals, too-many-arguments,
print(_("Cancelling translation."))
return 1

return translated_text
return translated_text, result_src


def list_to_sub_str(
Expand Down
9 changes: 3 additions & 6 deletions autosub/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,18 +150,15 @@ def get_cmd_parser(): # pylint: disable=too-many-statements
lang_group.add_argument(
'-SRC', '--src-language',
metavar=_('lang_code'),
default='auto',
help=_("Lang code/Lang tag for translation source language. "
"If not given, use langcodes to get a best matching "
"of the \"-S\"/\"--speech-language\". "
"If using py-googletrans as the method to translate, "
"WRONG INPUT STOP RUNNING. "
"If not given, use py-googletrans to auto-detect the src language. "
"(arg_num = 1) (default: %(default)s)"))

lang_group.add_argument(
'-D', '--dst-language',
metavar=_('lang_code'),
help=_("Lang code/Lang tag for translation destination language. "
"Same attention in the \"-SRC\"/\"--src-language\". "
"(arg_num = 1) (default: %(default)s)"))

lang_group.add_argument(
Expand All @@ -170,7 +167,7 @@ def get_cmd_parser(): # pylint: disable=too-many-statements
nargs="*",
help=_("Allow langcodes to get a best matching lang code "
"when your input is wrong. "
"Only functional for py-googletrans and Google Speech V2. "
"Only functional for py-googletrans and Google Speech API. "
"Available modes: "
"s, src, d, all. "
"\"s\" for \"-S\"/\"--speech-language\". "
Expand Down
1 change: 1 addition & 0 deletions docs/CHANGELOG.zh-Hans.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
- 添加停用词用于merge_src_assfile方法里的断句。
- 添加标点符号分割功能在merge_src_assfile方法里。
- 添加音频长度至少为4字节的检测,在SplitIntoAudioPiece里。
- 添加源语言自动识别功能,当不输入`-SRC`选项时。

#### 改动(未发布)

Expand Down

0 comments on commit f48e445

Please sign in to comment.