-
Notifications
You must be signed in to change notification settings - Fork 0
/
machine_translator.py
executable file
·144 lines (91 loc) · 3.35 KB
/
machine_translator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/env python
import os
import sys
import time
from google.cloud import translate_v2 as translate
from bs4 import BeautifulSoup
from bs4.element import Comment
from parse_translated_content import TranslatedPage
class MachineTranslator():
    """
    Class for handling interaction with google translation api.

    HTML is translated node by node. Markup that is too long for one
    request is broken up by recursing into the tag's children, and text
    nodes that are too long are split into smaller chunks.
    """

    # Google translate api barfs if text is too long
    MAX_TEXT_LEN = 5000

    # Seconds to sleep after each request so we don't call google
    # translate too often per second
    REQUEST_DELAY_SECS = 1

    def __init__(self, key_file='google_translate_key.json'):
        """
        Set up the api.

        key_file is the path of the service account JSON key handed to
        the translation client; it defaults to the file name this class
        has always used.
        """
        # Instantiate a translation client
        self.client = translate.Client.from_service_account_json(key_file)

    def translate_internal(self, lang, text):
        """
        Handle the call to the translation api.

        lang is passed as the target language; the source language is
        always 'en'. Returns the 'translatedText' entry of the response.
        """
        # Translate the content
        translation = self.client.translate(text, source_language='en', target_language=lang)

        # Pause to make sure we don't call google translate too often per second
        time.sleep(self.REQUEST_DELAY_SECS)

        return translation['translatedText']

    @staticmethod
    def _split_text(text, limit):
        """
        Split text into consecutive chunks of at most limit characters.

        A chunk ends just after the last space, newline or tab inside the
        window when there is one, otherwise the text is cut at exactly
        limit characters. Concatenating the chunks gives back text.
        Raises ValueError if limit is smaller than 1.
        """
        if limit < 1:
            raise ValueError("limit must be at least 1")

        chunks = []
        start = 0
        while len(text) - start > limit:
            window_end = start + limit
            # rfind excludes window_end, so cut + 1 never exceeds the window.
            cut = max(text.rfind(ws, start, window_end) for ws in " \n\t")
            end = cut + 1 if cut > start else window_end
            chunks.append(text[start:end])
            start = end
        chunks.append(text[start:])
        return chunks

    def translate_elt(self, lang, elt):
        """
        Traverse the html and translate each tag that has content.

        Comments and a lone newline produce "\n". Anything that fits in
        MAX_TEXT_LEN is sent to the api in one request and gets a
        trailing "\n". Longer text nodes are translated chunk by chunk;
        longer tags keep their own opening and closing tag while every
        child is translated recursively.
        """
        if isinstance(elt, Comment):
            return "\n"

        text = str(elt)
        if text == "\n":
            return "\n"

        if len(text) <= MachineTranslator.MAX_TEXT_LEN:
            # Translate this tag.
            return self.translate_internal(lang=lang, text=text) + "\n"

        # bs4 text nodes are str subclasses; they have no children to
        # recurse into, so an over-long one is split up instead.
        if isinstance(elt, str):
            pieces = [
                self.translate_internal(lang=lang, text=chunk)
                for chunk in self._split_text(text, MachineTranslator.MAX_TEXT_LEN)
            ]
            # NOTE(review): joined with a space on the assumption that the
            # api may trim whitespace at chunk boundaries - confirm.
            return " ".join(pieces) + "\n"

        if not elt.contents:
            # A tag without children holds no text to translate, so it is
            # passed through unchanged.
            return text + "\n"

        # The tag is too long: recur on the tag's children. The markup is
        # rendered as opening tag + contents + closing tag, so the opening
        # tag is whatever precedes the rendered contents. Slicing it off
        # this way keeps leading text out of the prefix, so that text is
        # only emitted once (translated, via the children loop).
        closing = "</" + elt.name + ">"
        inner = elt.decode_contents()
        opening = text[: len(text) - len(inner) - len(closing)]
        translated_children = "".join(
            self.translate_elt(lang=lang, elt=child) for child in elt.children
        )
        return opening + translated_children + closing

    def translate(self, lang, text):
        """
        Translate text to desired language.

        text is parsed as html and each top-level node is translated in
        turn. Empty or missing text yields "" without calling the api.
        """
        if not text:
            return ""

        soup = BeautifulSoup(text, 'html.parser')
        return "".join(self.translate_elt(lang=lang, elt=node) for node in soup.children)
def output_filename(filename, lang):
    """
    Return the name of the file that receives the lang translation.

    The extension of filename (if any) is replaced by "_<lang>.txt".
    Only the extension of the last path component is touched, so dots
    in directory names and names without an extension are handled.
    """
    stem, _ext = os.path.splitext(filename)
    return f"{stem}_{lang}.txt"


def main():
    """
    Translate a file into all desired languages.

    Usage: machine_translator.py <file>
    e.g., machine_translator.py 'crio.txt'
    """
    if len(sys.argv) != 2:
        sys.exit(f"Usage: {sys.argv[0]} <file>")

    filename = sys.argv[1]

    # Figure out our output file names and go ahead
    # and delete previous output files.
    filenames = {}
    for lang in TranslatedPage.LANG_MAP:
        filename_out = output_filename(filename, lang)
        filenames[lang] = filename_out
        try:
            os.remove(filename_out)
        except FileNotFoundError:
            pass

    # Now read the input and translate it.
    # The encoding is explicit so non-ASCII text does not depend on the
    # platform's default encoding.
    with open(filename, encoding="utf-8") as file_in:
        content = file_in.read()

    translator = MachineTranslator()

    for lang in sorted(filenames):
        output = translator.translate(lang, content)
        with open(filenames[lang], "w", encoding="utf-8") as file_out:
            file_out.write(output)
        print(f"Translated {lang}")

    print('\nSaved all translations\n')


if __name__ == '__main__':
    main()